Skip to content

Commit

Permalink
feat(tags-view-materialized): move mat_views.tags_view into db/migrat…
Browse files Browse the repository at this point in the history
…ions instead of AnalysisDB;

re-organize mat_views.tags_view to materialized views, tear down AnalyticsDB

resolves #3855
  • Loading branch information
49659410+tx0c committed May 10, 2024
1 parent 920e3b8 commit c7817ec
Showing 1 changed file with 219 additions and 0 deletions.
219 changes: 219 additions & 0 deletions db/migrations/20240515080000_create_mat_views_tags_view.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
// coverted from
// https://github.com/thematters/matters-metabase/blob/main/sql/stale-tags-create-table-view.sql
// from AnalysisDB's regular daily view format to materialized view;

const schema = 'mat_views'
const materialized_view_name = 'tags_view_materialized'

exports.up = async (knex) => {
await knex.raw(/*sql*/ `
CREATE OR REPLACE FUNCTION ${schema}.slug(input text) RETURNS text AS $f$
SELECT COALESCE(
NULLIF(trim(both '-' from regexp_replace(input, '\W+', '-', 'g')), ''),
-- NULLIF(trim(both '-' from regexp_replace(input, '[()()@''’"<>,.?;&&!│||/##、,.…:;「」《》?!\\+—\-ㅤ\s]+', '-', 'g')), ''),
NULLIF(trim(both '-' from regexp_replace(input, '\s+', '-', 'g')), ''),
input
)
$f$ LANGUAGE SQL IMMUTABLE RETURNS NULL ON NULL INPUT;
CREATE OR REPLACE FUNCTION ${schema}.is_conforming_tag(input text) RETURNS boolean AS $f$
SELECT length(input)<=40 AND input ~ '^\w+$' -- regular expression match
$f$ LANGUAGE SQL IMMUTABLE RETURNS NULL ON NULL INPUT;
DROP MATERIALIZED VIEW IF EXISTS ${schema}.${materialized_view_name} CASCADE;
WITH temp_tag AS (
SELECT id, created_at, updated_at, remark, deleted, starts_with_hash, same_as_slug, cover, description, editors, creator, owner, major_tag_id, is_major_tag, is_conforming_tag,
content_orig, COALESCE(slug, ${schema}.slug(lower(content))) AS slug
FROM (
SELECT id, created_at, updated_at, remark, deleted, cover, description, editors, creator, owner, major_tag_id, slug, (major_tag_id IS NULL OR major_tag_id=id) AS is_major_tag,
${schema}.is_conforming_tag(content),
content AS content_orig,
(starts_with(content, '#') OR starts_with(content, '#')) AS starts_with_hash,
(content = ${schema}.slug(content)) AS same_as_slug,
CASE WHEN t1.major_tag_id IS NOT NULL AND t1.major_tag_id != t1.id THEN (SELECT t2.content FROM tag t2 WHERE t2.id=t1.major_tag_id)
ELSE t1.content
END AS content -- use t2.content alias
FROM public.tag t1
) t1
), temp_article_tag_stats_by_id AS (
SELECT tag_id ::int,
COUNT(DISTINCT article_id) ::int AS tag_articles, COUNT(DISTINCT author_id) ::int AS tag_authors,
(MAX(at.created_at) + '1 day'::interval - '1 microsecond'::interval) ::date - MIN(at.created_at) ::date AS span_days
FROM public.article_tag at JOIN public.article a ON article_id=a.id AND a.state IN ('active')
GROUP BY 1
), temp_tag_uses AS (
SELECT tag_id ::int, ARRAY_AGG(DISTINCT article_id) AS article_ids, COUNT(DISTINCT article_id) ::int
-- FROM article_tag
FROM public.article_tag at JOIN public.article a ON article_id=a.id AND a.state IN ('active')
GROUP BY 1
HAVING COUNT(article_id)>0
), temp_tag_common_count AS (
SELECT t.tag_id, tag_rel_id, count_common,
tu1.count AS count_target, tu2.count AS count_rel,
(tu1.count + tu2.count - count_common) AS count_union, temp_tag.is_conforming_tag
FROM (
SELECT tu.tag_id ::int, at.tag_id ::int AS tag_rel_id, COUNT(*) ::int AS count_common
FROM temp_tag_uses tu
JOIN article_tag at ON at.article_id =ANY(tu.article_ids) AND (at.tag_id <> tu.tag_id)
GROUP BY 1, 2
) t
JOIN temp_tag_uses tu1 ON tu1.tag_id=t.tag_id
JOIN temp_tag_uses tu2 ON tu2.tag_id=t.tag_rel_id
LEFT JOIN temp_tag ON tu2.tag_id=temp_tag.id -- WHERE tag.is_conforming_tag
), temp_tag_similarity AS (
SELECT *, -- (count1+count2-count_common) AS count_union,
ROUND(count_common ::numeric / count_union, 5) AS similarity,
ROUND((count_common ::numeric / count_target + count_common ::numeric / count_rel)/2, 5) AS diff_similarity,
-- ROUND(count_common ::numeric / count_union * count_rel, 5) AS sim_to_target,
rank() OVER(PARTITION BY tag_id ORDER BY
(count_common ::numeric * count_rel / count_union) DESC, -- count_rel * similarity DESC
(count_common ::numeric / count_union) DESC, count_common DESC) -- FILTER (WHERE is_conforming_tag)
FROM temp_tag_common_count t
), temp_article_tag_rels_by_id AS (
SELECT tag_id, to_jsonb((ARRAY_AGG(
(to_jsonb(ts.*) - '{rank,tag_id,count_target,is_conforming_tag}'::text[] -- || jsonb_build_object('tag_content', temp_tag.content)
) ORDER BY rank ASC) FILTER (WHERE is_conforming_tag)
)[1:100]) AS top_rels
FROM temp_tag_similarity ts
-- JOIN temp_tag ON ts.tag_rel_id=temp_tag.id
WHERE ts.is_conforming_tag AND
rank<=130 -- extract only first 100 from
GROUP BY 1
), action_tag_stats AS (
SELECT target_id AS tag_id, action, COUNT(*) ::int AS count,
COUNT(DISTINCT user_id) ::int AS num_users,
MIN(created_at) AS earliest,
MAX(created_at) AS latest,
-- MAX(created_at) ::date - MIN(created_at) ::date AS span_days
(MAX(created_at) + '1 day'::interval - '1 microsecond'::interval) ::date - MIN(created_at) ::date AS span_days
FROM public.action_tag
GROUP BY 1, 2
), article_tag_stats AS (
SELECT t.slug, -- COUNT(*) ::int AS count,
COUNT(DISTINCT at.article_id) FILTER(WHERE a.state IN ('active')) ::int AS num_articles,
COUNT(DISTINCT a.author_id) FILTER(WHERE a.state IN ('active')) ::int AS num_authors,
COUNT(DISTINCT at.article_id) FILTER(WHERE age(at.created_at) <= '3 months'::interval AND a.state IN ('active')) ::int AS num_articles_r3m,
COUNT(DISTINCT a.author_id) FILTER(WHERE age(at.created_at) <= '3 months'::interval AND a.state IN ('active')) ::int AS num_authors_r3m,
COUNT(DISTINCT at.article_id) FILTER(WHERE age(at.created_at) <= '1 month'::interval AND a.state IN ('active')) ::int AS num_articles_r1m,
COUNT(DISTINCT a.author_id) FILTER(WHERE age(at.created_at) <= '1 month'::interval AND a.state IN ('active')) ::int AS num_authors_r1m,
COUNT(DISTINCT at.article_id) FILTER(WHERE age(at.created_at) <= '2 weeks'::interval AND a.state IN ('active')) ::int AS num_articles_r2w,
COUNT(DISTINCT a.author_id) FILTER(WHERE age(at.created_at) <= '2 weeks'::interval AND a.state IN ('active')) ::int AS num_authors_r2w,
COUNT(DISTINCT at.article_id) FILTER(WHERE age(at.created_at) <= '1 week'::interval AND a.state IN ('active')) ::int AS num_articles_r1w,
COUNT(DISTINCT a.author_id) FILTER(WHERE age(at.created_at) <= '1 week'::interval AND a.state IN ('active')) ::int AS num_authors_r1w,
(MAX(at.created_at) + '1 day'::interval - '1 microsecond'::interval) ::date - MIN(at.created_at) ::date AS span_days,
MIN(at.created_at) AS earliest_use,
MAX(at.created_at) AS latest_use,
to_jsonb(ARRAY_AGG(DISTINCT jsonb_build_object(
'id', t.id, 'tag', t.content_orig, 'id_slug', (t.id || '-' || t.slug),
'tag_articles', tag_articles, 'tag_authors', tag_authors,
'same_as_slug', t.same_as_slug, -- t.content_orig = pg_temp.slug(t.content_orig),
'starts_with_hash', t.starts_with_hash, -- (starts_with(content, '#') OR starts_with(content, '#')),
'is_major_tag', t.is_major_tag, -- t.major_tag_id=t.id,
'is_conforming_tag', t.is_conforming_tag,
'span_days', at.span_days, 'created_at', t.created_at,
'deleted', t.deleted,
'url', ('/tags/' || rtrim(encode(('Tag:' || t.id) ::bytea, 'base64'), '='))
) -- ORDER BY tag_articles DESC
)) AS dups,
to_jsonb(ARRAY_AGG(DISTINCT to_jsonb(t.*) || jsonb_build_object('editors', t.editors ::int[]))) AS details,
to_jsonb(ARRAY_AGG(DISTINCT to_jsonb(action_tag_stats.*)) FILTER (WHERE action_tag_stats.count IS NOT NULL) ) AS action_details
FROM temp_tag t
LEFT JOIN (
SELECT *
FROM public.article_tag at
JOIN temp_article_tag_stats_by_id USING (tag_id)
) at ON t.id=at.tag_id
LEFT JOIN action_tag_stats USING (tag_id)
LEFT JOIN public.article a ON at.article_id=a.id AND a.state IN ('active')
GROUP BY 1
), tag_slug_aliases AS (
SELECT slug,
ARRAY_AGG(id_slug_orig
ORDER BY is_major_tag DESC NULLS LAST, deleted ASC NULLS LAST, -- false<true<null
-- content NOT SIMILAR TO '(#|#)%' DESC,
starts_with_hash ASC NULLS LAST, -- (starts_with(content, '#') || starts_with(content, '#')) DESC,
content_orig = ${schema}.slug(content_orig) DESC, num_authors DESC NULLS LAST, num_articles DESC NULLS LAST, span_days DESC NULLS LAST, created_at ASC, id ASC) AS aliases,
ARRAY_AGG(id
ORDER BY is_major_tag DESC NULLS LAST, deleted ASC NULLS LAST, -- false<true<null
-- content NOT SIMILAR TO '(#|#)%' DESC,
starts_with_hash ASC NULLS LAST, -- (starts_with(content, '#') || starts_with(content, '#')) DESC,
content_orig = ${schema}.slug(content_orig) DESC, num_authors DESC NULLS LAST, num_articles DESC NULLS LAST, span_days DESC NULLS LAST, created_at ASC, id ASC) AS dup_tag_ids
FROM (
SELECT t.slug, t.id ::int, is_major_tag, t.content_orig, deleted, starts_with_hash,
concat(t.id, '-', ${schema}.slug(t.content_orig)) AS id_slug_orig,
COUNT(DISTINCT t.id) AS num_ids,
COUNT(DISTINCT article_id) ::int AS num_articles,
COUNT(DISTINCT author_id) ::int AS num_authors,
MAX(at.created_at) ::date - MIN(at.created_at) ::date AS span_days,
MIN(t.created_at) AS created_at
FROM temp_tag t
LEFT JOIN public.article_tag at ON at.tag_id=t.id
LEFT JOIN public.article a ON at.article_id=a.id AND a.state IN ('active') -- NOT IN ('archived', 'banned')
GROUP BY 1, 2, 3, 4, 5, 6, 7
) top
GROUP BY 1
)
SELECT DISTINCT ON (id_slug)
id_slug, id, content, slug, /* same_as_slug, is_major_tag, major_tag_id, */ dup_tag_ids,
num_articles, num_authors, num_articles_r3m, num_authors_r3m, num_articles_r1m, num_authors_r1m, num_articles_r2w, num_authors_r2w, num_articles_r1w, num_authors_r1w, -- sum_read_time_top20_r2w,
span_days,
earliest_use, latest_use, description, created_at, updated_at,
creator, owner, editors, -- merged_editors, deleted, -- cover, -- tag_articles, tag_authors,
top_rels, dups, details, action_details -- , stats_series
FROM (
SELECT COALESCE(aliases[1], (t.id || '-' || t.slug)) AS id_slug,
t.id ::int, t.content_orig AS content, t.slug, t.starts_with_hash, t.same_as_slug, -- t.content_orig = pg_temp.slug(t.content_orig) AS same_as_slug, -- t.slug, -- COALESCE(major_tag_id, t.id) ::int AS major_tag_id,
is_major_tag, is_conforming_tag,
dup_tag_ids[1] ::int AS major_tag_id, dup_tag_ids ::int[],
COALESCE(num_articles, 0) ::int AS num_articles, COALESCE(num_authors, 0) ::int AS num_authors,
num_articles_r3m, num_authors_r3m, num_articles_r1m, num_authors_r1m, num_articles_r2w, num_authors_r2w, num_articles_r1w, num_authors_r1w,
COALESCE(at2.span_days, 0) ::int AS span_days,
earliest_use, latest_use, -- from at2.*
t.description, t.created_at, t.updated_at, t.creator ::int, t.owner ::int, t.editors ::int[] AS editors, t.deleted,
COALESCE(tag_articles, 0) ::int AS tag_articles,
COALESCE(tag_authors, 0) ::int AS tag_authors,
tag_rels.top_rels, -- atat.top_authors,
to_jsonb(ARRAY(
SELECT x FROM jsonb_array_elements(dups) x
ORDER BY x->'deleted' ASC, x->'is_major_tag' DESC, x->'is_conforming_tag' DESC, x->'starts_with_hash' ASC, x->'same_as_slug' DESC,
x->'tag_authors' DESC, x->'tag_articles' DESC, x->'span_days' DESC, (x->>'created_at')::timestamptz ASC, x->'id' ASC
)) AS dups, action_details
FROM temp_tag t
LEFT JOIN temp_article_tag_stats_by_id tag_stats ON tag_stats.tag_id=t.id
LEFT JOIN temp_article_tag_rels_by_id tag_rels ON tag_rels.tag_id=t.id
LEFT JOIN article_tag_stats at2 USING (slug)
-- LEFT JOIN article_tag_stats_by_slug atst USING (slug)
-- LEFT JOIN article_tag_authors_by_slug atat USING (slug)
-- LEFT JOIN tag_read_time_by_slug artst USING (slug)
LEFT JOIN tag_slug_aliases USING (slug)
) t1
ORDER BY id_slug,
deleted ASC NULLS FIRST,
is_major_tag DESC NULLS LAST,
is_conforming_tag DESC NULLS LAST,
starts_with_hash ASC NULLS LAST,
same_as_slug DESC, tag_authors DESC NULLS LAST, tag_articles DESC NULLS LAST, span_days DESC NULLS LAST, created_at ASC
; `)

// add indexes
await knex.schema.alterTable(`${schema}.${materialized_view_name}`, (t) => {
t.primary('id')
.index('dup_tag_ids', { indexType: 'gin' })
// add unique index for refresh concurrently
.unique('id_slug')
.unique('content')
.unique('slug')
// .unique(['id', 'type'], { useConstraint: false })
})
}

exports.down = async (knex) => {
await knex.raw(
/*sql*/ `DROP MATERIALIZED VIEW IF EXISTS ${schema}.${materialized_view_name} CASCADE;`
)
}

0 comments on commit c7817ec

Please sign in to comment.