Given the table definitions below.
create table posts_v2 (
id varchar primary key,
channel varchar,
external_id varchar,
published_at timestamp with time zone,
created_at timestamp with time zone,
url varchar null,
title varchar,
content text,
transcript array (
object as (
start double precision,
text text
)
) null,
media array (object (dynamic)) null,
language varchar,
hashtags array (varchar) null,
comment_count bigint,
like_count bigint,
view_count bigint,
parent_external_id varchar null,
extras object(dynamic) null,
sentiment varchar check (sentiment in ('negative', 'positive', 'neutral')),
source object as (
id varchar,
description text,
name varchar,
url varchar,
domain varchar,
favicon varchar,
score double precision,
monthly_traffic bigint,
country_id varchar,
state_id varchar,
city_id varchar,
location varchar
) null,
profile object as (
id varchar,
avatar varchar,
name varchar,
username varchar,
description text,
url varchar,
links array(varchar),
location varchar,
joined_at timestamp,
verification_status varchar,
gender varchar,
followers bigint,
following bigint,
post_count integer
) null,
index keyword_ft using fulltext(title, content, transcript ['text']) with (analyzer = 'ilike_analyzer'),
index search_ft using fulltext(
source['name'],
source['url'],
profile['name'],
profile['username'],
profile['url']
) with (analyzer = 'ilike_analyzer'),
check (source['location'] in ('local', 'foreign')),
check (profile['verification_status'] in ('unknown', 'unverified', 'verified')),
check (profile['gender'] in ('male', 'female', 'unknown'))
);
create table projects_posts_v2 (
project_id varchar,
labels array (varchar),
sentiment varchar check (sentiment in ('negative', 'positive', 'neutral')),
favorited_at timestamp with time zone,
hidden_at timestamp with time zone,
post_id varchar,
channel varchar,
external_id varchar,
published_at timestamp with time zone,
created_at timestamp with time zone,
url varchar null,
title varchar,
content text,
transcript array (
object as (
start double precision,
text text
)
) null,
media array (object (dynamic)) null,
language varchar,
hashtags array (varchar) null,
comment_count bigint,
like_count bigint,
view_count bigint,
parent_external_id varchar null,
extras object(dynamic) null,
source object as (
id varchar,
description text,
name varchar,
url varchar,
domain varchar,
favicon varchar,
score double precision,
monthly_traffic bigint,
country_id varchar,
state_id varchar,
city_id varchar,
location varchar
) null,
profile object as (
id varchar,
avatar varchar,
name varchar,
username varchar,
description text,
url varchar,
links array(varchar),
location varchar,
joined_at timestamp,
verification_status varchar,
gender varchar,
followers bigint,
following bigint,
post_count integer
) null,
index keyword_ft using fulltext(title, content, transcript ['text']) with (analyzer = 'ilike_analyzer'),
index search_ft using fulltext(
source['name'],
source['url'],
profile['name'],
profile['username'],
profile['url']
) with (analyzer = 'ilike_analyzer'),
check (source['location'] in ('local', 'foreign')),
check (profile['verification_status'] in ('unknown', 'unverified', 'verified')),
check (profile['gender'] in ('male', 'female', 'unknown')),
primary key (project_id, post_id)
);
The projects_posts_v2 looks nearly identical to the posts_v2 table but has a few additional, mostly null fields. The amount of space these tables take respectively is not proportional.
The posts_v2 has 59m records that take up 98GB, whereas the projects_posts_v2 has 75m records that take up 583 GB. I would expect the projects_posts_v2 to take up only (75 / 59) * 98.
I couldn’t spot the source of this phenomenon. Would you help me figure it out?
Table comparison:
- posts_v2: 59M records, 98GB (≈1.66 KB/record)
- projects_posts_v2: 75M records, 583GB (≈7.77 KB/record)
Expected size for projects_posts_v2: (75/59) * 98 = ~124GB
Actual size: 583GB (4.7x larger than expected)
Main differences in projects_posts_v2:
- Additional columns: project_id, labels, favorited_at, hidden_at
- Composite primary key (project_id, post_id) instead of single column
- These additional fields are mostly NULL
What could explain this unexpected 4.7x size increase despite a nearly identical structure and mostly NULL additional columns?