Hi @djbestenergy
A little more context with a few more tests (inserting 1 million records into different table schemas):
General note: for size comparisons it is good practice to limit the shards to 1 and run an OPTIMIZE to fully merge the segments.
-- Summary of the table schemas under test (full, runnable DDL is in test_size.sql below):
--   *_io    -> INDEX OFF (no inverted index on the column)
--   *_io_co -> INDEX OFF and column store disabled
create table float(num FLOAT)
create table float_io(num FLOAT INDEX OFF)
create table double(num DOUBLE)
create table double_io(num DOUBLE INDEX OFF)
create table text_float(num TEXT)
create table text_float_io(num TEXT INDEX OFF)
create table text_float_io_co(num TEXT INDEX OFF STORAGE WITH (columnstore = FALSE))
create table text_double(num TEXT)
create table text_double_io(num TEXT INDEX OFF)
create table text_double_io_co(num TEXT INDEX OFF STORAGE WITH (columnstore = FALSE))
test_size.sql
-- test_size.sql
-- Compares on-disk segment size for storing floating point values as
-- FLOAT/DOUBLE vs TEXT, with the inverted index and column store on/off.
-- Methodology: 1 shard, 0 replicas, and OPTIMIZE down to a single segment
-- per table so the reported sizes are directly comparable.
set search_path to 'size_test';

-- Start from a clean slate so repeated runs produce comparable numbers.
drop table if exists float;
drop table if exists double;
drop table if exists float_io;
drop table if exists double_io;
drop table if exists text_float;
drop table if exists text_float_io;
drop table if exists text_float_io_co;
drop table if exists text_double;
drop table if exists text_double_io;
drop table if exists text_double_io_co;

-- One shard and no replicas: each table ends up as exactly one primary
-- shard, which OPTIMIZE below can merge into a single segment.
-- Suffixes: _io = INDEX OFF, _co = column store off.
create table float(num FLOAT) clustered into 1 shards with ("number_of_replicas" = 0);
create table float_io(num FLOAT INDEX OFF) clustered into 1 shards with ("number_of_replicas" = 0);
create table double(num DOUBLE) clustered into 1 shards with ("number_of_replicas" = 0);
create table double_io(num DOUBLE INDEX OFF) clustered into 1 shards with ("number_of_replicas" = 0);
create table text_float(num TEXT) clustered into 1 shards with ("number_of_replicas" = 0);
create table text_float_io(num TEXT INDEX OFF) clustered into 1 shards with ("number_of_replicas" = 0);
create table text_float_io_co(num TEXT INDEX OFF STORAGE WITH (columnstore = FALSE)) clustered into 1 shards with ("number_of_replicas" = 0);
create table text_double(num TEXT) clustered into 1 shards with ("number_of_replicas" = 0);
create table text_double_io(num TEXT INDEX OFF) clustered into 1 shards with ("number_of_replicas" = 0);
create table text_double_io_co(num TEXT INDEX OFF STORAGE WITH (columnstore = FALSE)) clustered into 1 shards with ("number_of_replicas" = 0);

-- Seed one table with 1 million random doubles; refresh makes the rows
-- visible to the SELECTs that populate the remaining tables from it, so
-- every table holds the SAME values (only the storage format differs).
insert into double
select random() from generate_series(1, 1000000, 1);
refresh table double;

insert into float select num::FLOAT from double;
insert into float_io select num::FLOAT from double;
insert into double_io select * from double;
insert into text_float select num::FLOAT from double;
insert into text_float_io select num::FLOAT from double;
insert into text_float_io_co select num::FLOAT from double;
insert into text_double select num from double;
insert into text_double_io select num from double;
insert into text_double_io_co select num from double;

-- Make all writes visible before merging.
refresh table float;
refresh table float_io;
refresh table double;
refresh table double_io;
refresh table text_float;
refresh table text_float_io;
refresh table text_float_io_co;
refresh table text_double;
refresh table text_double_io;
refresh table text_double_io_co;

-- Merge each shard down to a single segment so segment sizes reflect the
-- storage format rather than merge state.
optimize table float with (max_num_segments = 1);
optimize table float_io with (max_num_segments = 1);
optimize table double with (max_num_segments = 1);
optimize table double_io with (max_num_segments = 1);
optimize table text_float with (max_num_segments = 1);
optimize table text_float_io with (max_num_segments = 1);
optimize table text_float_io_co with (max_num_segments = 1);
optimize table text_double with (max_num_segments = 1);
optimize table text_double_io with (max_num_segments = 1);
optimize table text_double_io_co with (max_num_segments = 1);

-- Report the per-table segment size in MiB (primary shards only).
select
    table_name,
    round(size / power(1024, 2))::TEXT || ' MiB' as size
from sys.segments
where primary and table_schema = 'size_test'
order by table_name;
The script can be run locally by putting it in a file and executing it with crash:
crash < test_size.sql
Results:
+-------------------+--------+
| table_name | size |
+-------------------+--------+
| double | 49 MiB |
| double_io | 37 MiB |
| text_double | 59 MiB |
| text_double_io | 46 MiB |
| text_double_io_co | 31 MiB |
+-------------------+--------+
| float | 31 MiB |
| float_io | 26 MiB |
| text_float | 37 MiB |
| text_float_io | 29 MiB |
| text_float_io_co | 23 MiB |
+-------------------+--------+
-- io -> index off
-- co -> column store off
i.e. the storage savings from storing floating point values as text is only ~15%, but it comes with a significant performance impact, especially for aggregations:
echo "SELECT avg(num), count(num) from size_test.float;" | cr8 timeit --hosts localhost:4200
Runtime (in ms):
mean: 30.928 ± 0.148
min/max: 30.006 → 34.941
% echo "SELECT avg(num), count(num) from size_test.float_io;" | cr8 timeit --hosts localhost:4200
Runtime (in ms):
mean: 31.006 ± 0.150
min/max: 29.812 → 33.890
% echo "SELECT avg(num::float), count(num) from size_test.text_float;" | cr8 timeit --hosts localhost:4200
Runtime (in ms):
mean: 526.880 ± 8.488
min/max: 475.093 → 715.010
% echo "SELECT avg(num::float), count(num) from size_test.text_float_io;" | cr8 timeit --hosts localhost:4200
Runtime (in ms):
mean: 554.985 ± 12.245
min/max: 474.457 → 789.776
% echo "SELECT avg(num::float), count(num) from size_test.text_float_io_co;" | cr8 timeit --hosts localhost:4200;
Runtime (in ms):
mean: 9963.003 ± 101.061
min/max: 9122.198 → 11976.021
float_io
vs text_float_io_co
→ 321x
That being said, this test was done with completely random data. Encoding/compression might work differently with real-world data, which typically shows less randomness.