Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(tags-view-materialized): move mat_views.tags_view into db/migrat… #3914

Merged
merged 1 commit into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
221 changes: 221 additions & 0 deletions db/migrations/20240515080100_create_mat_views_tags_view.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
// coverted from
// https://github.com/thematters/matters-metabase/blob/main/sql/stale-tags-create-table-view.sql
tx0c marked this conversation as resolved.
Show resolved Hide resolved
// from AnalysisDB's regular daily view format to materialized view;

const schema = 'mat_views'
const materialized_view_name = 'tags_lasts_view_materialized'

exports.up = async (knex) => {
await knex.raw(/*sql*/ `
CREATE SCHEMA IF NOT EXISTS "${schema}" ;

CREATE OR REPLACE FUNCTION "${schema}".slug(input text) RETURNS text AS $f$
SELECT COALESCE(
NULLIF(trim(both '-' from regexp_replace(input, '\\W+', '-', 'g')), ''),
-- NULLIF(trim(both '-' from regexp_replace(input, '[()()@''’"<>,.?;&&!│||/##、,.…:;「」《》?!\\\\+—\\-ㅤ\\s]+', '-', 'g')), ''),
NULLIF(trim(both '-' from regexp_replace(input, '\\s+', '-', 'g')), ''),
input
)
$f$ LANGUAGE SQL IMMUTABLE RETURNS NULL ON NULL INPUT;

CREATE OR REPLACE FUNCTION "${schema}".is_conforming_tag(input text) RETURNS boolean AS $f$
SELECT length(input)<=40 AND input ~ '^\\w+$' -- regular expression match
$f$ LANGUAGE SQL IMMUTABLE RETURNS NULL ON NULL INPUT;


DROP MATERIALIZED VIEW IF EXISTS "${schema}"."${materialized_view_name}" CASCADE;

CREATE MATERIALIZED VIEW "${schema}"."${materialized_view_name}" AS

WITH temp_tag AS (
SELECT id, created_at, updated_at, remark, deleted, starts_with_hash, same_as_slug, cover, description, editors, creator, owner, major_tag_id, is_major_tag, is_conforming_tag,
content_orig, COALESCE(slug, "${schema}".slug(lower(content))) AS slug
FROM (
SELECT id ::int, created_at, updated_at, remark, deleted, cover, description, editors ::int[], creator, owner, id ::int AS major_tag_id,
/* (major_tag_id IS NULL OR major_tag_id=id) */ NULL AS is_major_tag, NULL AS slug,
"${schema}".is_conforming_tag(content),
content AS content_orig,
(starts_with(content, '#') OR starts_with(content, '#')) AS starts_with_hash,
(content = "${schema}".slug(content)) AS same_as_slug,
content -- use t2.content alias
FROM public.tag t1
) t1
), temp_article_tag_stats_by_id AS (
SELECT tag_id ::int,
COUNT(DISTINCT article_id) ::int AS tag_articles, COUNT(DISTINCT author_id) ::int AS tag_authors,
(MAX(at.created_at) + '1 day'::interval - '1 microsecond'::interval) ::date - MIN(at.created_at) ::date AS span_days
FROM public.article_tag at JOIN public.article a ON article_id=a.id AND a.state IN ('active')
GROUP BY 1
), temp_tag_uses AS (
SELECT tag_id ::int, ARRAY_AGG(DISTINCT article_id) AS article_ids, COUNT(DISTINCT article_id) ::int
-- FROM article_tag
FROM public.article_tag at JOIN public.article a ON article_id=a.id AND a.state IN ('active')
GROUP BY 1
HAVING COUNT(article_id)>0
), temp_tag_common_count AS (
SELECT t.tag_id, tag_rel_id, count_common,
tu1.count AS count_target, tu2.count AS count_rel,
(tu1.count + tu2.count - count_common) AS count_union, temp_tag.is_conforming_tag
FROM (
SELECT tu.tag_id ::int, at.tag_id ::int AS tag_rel_id, COUNT(*) ::int AS count_common
FROM temp_tag_uses tu
JOIN article_tag at ON at.article_id =ANY(tu.article_ids) AND (at.tag_id <> tu.tag_id)
GROUP BY 1, 2
) t
JOIN temp_tag_uses tu1 ON tu1.tag_id=t.tag_id
JOIN temp_tag_uses tu2 ON tu2.tag_id=t.tag_rel_id
LEFT JOIN temp_tag ON tu2.tag_id=temp_tag.id -- WHERE tag.is_conforming_tag
), temp_tag_similarity AS (
SELECT *, -- (count1+count2-count_common) AS count_union,
ROUND(count_common ::numeric / count_union, 5) AS similarity,
ROUND((count_common ::numeric / count_target + count_common ::numeric / count_rel)/2, 5) AS diff_similarity,
-- ROUND(count_common ::numeric / count_union * count_rel, 5) AS sim_to_target,
rank() OVER(PARTITION BY tag_id ORDER BY
(count_common ::numeric * count_rel / count_union) DESC, -- count_rel * similarity DESC
(count_common ::numeric / count_union) DESC, count_common DESC) -- FILTER (WHERE is_conforming_tag)
FROM temp_tag_common_count t
), temp_article_tag_rels_by_id AS (
SELECT tag_id, to_jsonb((ARRAY_AGG(
(to_jsonb(ts.*) - '{rank,tag_id,count_target,is_conforming_tag}'::text[] -- || jsonb_build_object('tag_content', temp_tag.content)
) ORDER BY rank ASC) FILTER (WHERE is_conforming_tag)
)[1:100]) AS top_rels
FROM temp_tag_similarity ts
-- JOIN temp_tag ON ts.tag_rel_id=temp_tag.id
WHERE ts.is_conforming_tag AND
rank<=130 -- extract only first 100 from
GROUP BY 1
), action_tag_stats AS (
SELECT target_id AS tag_id, action, COUNT(*) ::int AS count,
COUNT(DISTINCT user_id) ::int AS num_users,
MIN(created_at) AS earliest,
MAX(created_at) AS latest,
-- MAX(created_at) ::date - MIN(created_at) ::date AS span_days
(MAX(created_at) + '1 day'::interval - '1 microsecond'::interval) ::date - MIN(created_at) ::date AS span_days
FROM public.action_tag
GROUP BY 1, 2
), article_tag_stats AS (
SELECT t.slug, -- COUNT(*) ::int AS count,
COUNT(DISTINCT at.article_id) FILTER(WHERE a.state IN ('active')) ::int AS num_articles,
COUNT(DISTINCT a.author_id) FILTER(WHERE a.state IN ('active')) ::int AS num_authors,
COUNT(DISTINCT at.article_id) FILTER(WHERE age(at.created_at) <= '3 months'::interval AND a.state IN ('active')) ::int AS num_articles_r3m,
COUNT(DISTINCT a.author_id) FILTER(WHERE age(at.created_at) <= '3 months'::interval AND a.state IN ('active')) ::int AS num_authors_r3m,
COUNT(DISTINCT at.article_id) FILTER(WHERE age(at.created_at) <= '1 month'::interval AND a.state IN ('active')) ::int AS num_articles_r1m,
COUNT(DISTINCT a.author_id) FILTER(WHERE age(at.created_at) <= '1 month'::interval AND a.state IN ('active')) ::int AS num_authors_r1m,
COUNT(DISTINCT at.article_id) FILTER(WHERE age(at.created_at) <= '2 weeks'::interval AND a.state IN ('active')) ::int AS num_articles_r2w,
COUNT(DISTINCT a.author_id) FILTER(WHERE age(at.created_at) <= '2 weeks'::interval AND a.state IN ('active')) ::int AS num_authors_r2w,
COUNT(DISTINCT at.article_id) FILTER(WHERE age(at.created_at) <= '1 week'::interval AND a.state IN ('active')) ::int AS num_articles_r1w,
COUNT(DISTINCT a.author_id) FILTER(WHERE age(at.created_at) <= '1 week'::interval AND a.state IN ('active')) ::int AS num_authors_r1w,
(MAX(at.created_at) + '1 day'::interval - '1 microsecond'::interval) ::date - MIN(at.created_at) ::date AS span_days,
MIN(at.created_at) AS earliest_use,
MAX(at.created_at) AS latest_use,
to_jsonb(ARRAY_AGG(DISTINCT jsonb_build_object(
'id', t.id, 'tag', t.content_orig, 'id_slug', (t.id || '-' || t.slug),
'tag_articles', tag_articles, 'tag_authors', tag_authors,
'same_as_slug', t.same_as_slug, -- t.content_orig = pg_temp.slug(t.content_orig),
'starts_with_hash', t.starts_with_hash, -- (starts_with(content, '#') OR starts_with(content, '#')),
-- 'is_major_tag', t.is_major_tag, -- t.major_tag_id=t.id,
'is_conforming_tag', t.is_conforming_tag,
'span_days', at.span_days, 'created_at', t.created_at,
'deleted', t.deleted,
'url', ('/tags/' || rtrim(encode(('Tag:' || t.id) ::bytea, 'base64'), '='))
) -- ORDER BY tag_articles DESC
)) AS dups,
-- to_jsonb(ARRAY_AGG(DISTINCT to_jsonb(t.*) || jsonb_build_object('editors', t.editors ::int[]))) AS details,
to_jsonb(ARRAY_AGG(DISTINCT to_jsonb(action_tag_stats.*)) FILTER (WHERE action_tag_stats.count IS NOT NULL) ) AS action_details
FROM temp_tag t
LEFT JOIN (
SELECT *
FROM public.article_tag at
JOIN temp_article_tag_stats_by_id USING (tag_id)
) at ON t.id=at.tag_id
LEFT JOIN action_tag_stats USING (tag_id)
LEFT JOIN public.article a ON at.article_id=a.id AND a.state IN ('active')
GROUP BY 1
), tag_slug_aliases AS (
SELECT slug,
ARRAY_AGG(id_slug_orig
ORDER BY is_major_tag DESC NULLS LAST, deleted ASC NULLS LAST, -- false<true<null
-- content NOT SIMILAR TO '(#|#)%' DESC,
starts_with_hash ASC NULLS LAST, -- (starts_with(content, '#') || starts_with(content, '#')) DESC,
content_orig = "${schema}".slug(content_orig) DESC, num_authors DESC NULLS LAST, num_articles DESC NULLS LAST, span_days DESC NULLS LAST, created_at ASC, id ASC) AS aliases,
ARRAY_AGG(id
ORDER BY is_major_tag DESC NULLS LAST, deleted ASC NULLS LAST, -- false<true<null
-- content NOT SIMILAR TO '(#|#)%' DESC,
starts_with_hash ASC NULLS LAST, -- (starts_with(content, '#') || starts_with(content, '#')) DESC,
content_orig = "${schema}".slug(content_orig) DESC, num_authors DESC NULLS LAST, num_articles DESC NULLS LAST, span_days DESC NULLS LAST, created_at ASC, id ASC) AS dup_tag_ids
FROM (
SELECT t.slug, t.id ::int, is_major_tag, t.content_orig, deleted, starts_with_hash,
concat(t.id, '-', "${schema}".slug(t.content_orig)) AS id_slug_orig,
COUNT(DISTINCT t.id) AS num_ids,
COUNT(DISTINCT article_id) ::int AS num_articles,
COUNT(DISTINCT author_id) ::int AS num_authors,
MAX(at.created_at) ::date - MIN(at.created_at) ::date AS span_days,
MIN(t.created_at) AS created_at
FROM temp_tag t
LEFT JOIN public.article_tag at ON at.tag_id=t.id
LEFT JOIN public.article a ON at.article_id=a.id AND a.state IN ('active') -- NOT IN ('archived', 'banned')
GROUP BY 1, 2, 3, 4, 5, 6, 7
) top
GROUP BY 1
)

SELECT DISTINCT ON (id_slug)
id_slug, id, content, slug, /* same_as_slug, is_major_tag, major_tag_id, */ dup_tag_ids,
num_articles, num_authors, num_articles_r3m, num_authors_r3m, num_articles_r1m, num_authors_r1m, num_articles_r2w, num_authors_r2w, num_articles_r1w, num_authors_r1w, -- sum_read_time_top20_r2w,
span_days,
earliest_use, latest_use, description, created_at, updated_at,
creator, owner, editors, -- merged_editors, deleted, -- cover, -- tag_articles, tag_authors,
top_rels, dups, action_details -- , stats_series
FROM (
SELECT COALESCE(aliases[1], (t.id || '-' || t.slug)) AS id_slug,
t.id ::int, t.content_orig AS content, t.slug, t.starts_with_hash, t.same_as_slug, -- t.content_orig = pg_temp.slug(t.content_orig) AS same_as_slug, -- t.slug, -- COALESCE(major_tag_id, t.id) ::int AS major_tag_id,
is_major_tag, is_conforming_tag,
dup_tag_ids[1] ::int AS major_tag_id, dup_tag_ids ::int[],
COALESCE(num_articles, 0) ::int AS num_articles, COALESCE(num_authors, 0) ::int AS num_authors,
num_articles_r3m, num_authors_r3m, num_articles_r1m, num_authors_r1m, num_articles_r2w, num_authors_r2w, num_articles_r1w, num_authors_r1w,
COALESCE(at2.span_days, 0) ::int AS span_days,
earliest_use, latest_use, -- from at2.*
t.description, t.created_at, t.updated_at, t.creator ::int, t.owner ::int, t.editors ::int[] AS editors, t.deleted,
COALESCE(tag_articles, 0) ::int AS tag_articles,
COALESCE(tag_authors, 0) ::int AS tag_authors,

tag_rels.top_rels, -- atat.top_authors,

to_jsonb(ARRAY(
SELECT x FROM jsonb_array_elements(dups) x
ORDER BY x->'deleted' ASC, x->'is_major_tag' DESC, x->'is_conforming_tag' DESC, x->'starts_with_hash' ASC, x->'same_as_slug' DESC,
x->'tag_authors' DESC, x->'tag_articles' DESC, x->'span_days' DESC, (x->>'created_at')::timestamptz ASC, x->'id' ASC
)) AS dups, action_details
FROM temp_tag t
LEFT JOIN temp_article_tag_stats_by_id tag_stats ON tag_stats.tag_id=t.id
LEFT JOIN temp_article_tag_rels_by_id tag_rels ON tag_rels.tag_id=t.id
LEFT JOIN article_tag_stats at2 USING (slug)
-- LEFT JOIN article_tag_stats_by_slug atst USING (slug)
-- LEFT JOIN article_tag_authors_by_slug atat USING (slug)
-- LEFT JOIN tag_read_time_by_slug artst USING (slug)
LEFT JOIN tag_slug_aliases USING (slug)
) t1
ORDER BY id_slug,
deleted ASC NULLS FIRST,
is_major_tag DESC NULLS LAST,
is_conforming_tag DESC NULLS LAST,
starts_with_hash ASC NULLS LAST,
same_as_slug DESC, tag_authors DESC NULLS LAST, tag_articles DESC NULLS LAST, span_days DESC NULLS LAST, created_at ASC
;

CREATE UNIQUE INDEX ON "${schema}"."${materialized_view_name}" (id) ;
CREATE UNIQUE INDEX ON "${schema}"."${materialized_view_name}" (id_slug) ;
CREATE UNIQUE INDEX ON "${schema}"."${materialized_view_name}" (content) ;
CREATE UNIQUE INDEX ON "${schema}"."${materialized_view_name}" (slug) ;

DROP INDEX IF EXISTS "${schema}"."${materialized_view_name}_dup_tag_ids_idx" ;
CREATE INDEX "${materialized_view_name}_dup_tag_ids_idx" ON "${schema}"."${materialized_view_name}" USING gin(dup_tag_ids) ;

`)
}

exports.down = async (knex) => {
await knex.raw(

Check warning on line 218 in db/migrations/20240515080100_create_mat_views_tags_view.js

View check run for this annotation

Codecov / codecov/patch

db/migrations/20240515080100_create_mat_views_tags_view.js#L218

Added line #L218 was not covered by tests
/*sql*/ `DROP MATERIALIZED VIEW IF EXISTS "${schema}"."${materialized_view_name}" CASCADE;`
)
}
1 change: 1 addition & 0 deletions src/common/enums/table.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export enum VIEW {

export enum MATERIALIZED_VIEW {
tag_count_materialized = 'tag_count_materialized',
tags_lasts_view_materialized = 'mat_views.tags_lasts_view_materialized',
user_reader_materialized = 'user_reader_materialized',
featured_comment_materialized = 'featured_comment_materialized',
curation_tag_materialized = 'curation_tag_materialized',
Expand Down
12 changes: 6 additions & 6 deletions src/connectors/tagService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
DEFAULT_TAKE_PER_PAGE,
TAG_ACTION,
VIEW,
MATERIALIZED_VIEW,
} from 'common/enums'
import { environment } from 'common/environment'
import { TooManyTagsForArticleError, ForbiddenError } from 'common/errors'
Expand Down Expand Up @@ -698,7 +699,7 @@
}): Promise<Array<{ id: string }>> =>
this.knex
.select('id')
.from(VIEW.tags_lasts_view)
.from(MATERIALIZED_VIEW.tags_lasts_view_materialized)
.modify(function (this: Knex.QueryBuilder) {
if (minAuthors) {
this.where('num_authors', '>=', minAuthors)
Expand Down Expand Up @@ -852,7 +853,7 @@

let result: any
try {
result = await this.knex(VIEW.tags_lasts_view)
result = await this.knex(MATERIALIZED_VIEW.tags_lasts_view_materialized)

Check warning on line 856 in src/connectors/tagService.ts

View check run for this annotation

Codecov / codecov/patch

src/connectors/tagService.ts#L856

Added line #L856 was not covered by tests
.select('id', 'content', 'id_slug', 'num_authors', 'num_articles')
.where(function (this: Knex.QueryBuilder) {
this.where('id', '=', tagId)
Expand Down Expand Up @@ -898,7 +899,7 @@

let result: any
try {
result = await this.knexRO(VIEW.tags_lasts_view)
result = await this.knexRO(MATERIALIZED_VIEW.tags_lasts_view_materialized)

Check warning on line 902 in src/connectors/tagService.ts

View check run for this annotation

Codecov / codecov/patch

src/connectors/tagService.ts#L902

Added line #L902 was not covered by tests
.select('id', 'content', 'id_slug', 'num_authors', 'num_articles')
.where(function (this: Knex.QueryBuilder) {
this.where('tag_id', tagId)
Expand Down Expand Up @@ -964,8 +965,7 @@
builder.orWhereIn(
'tag_id',
this.knex
.from(VIEW.tags_lasts_view)
// .joinRaw('CROSS JOIN unnest(dup_tag_ids) AS x(id)')
.from(MATERIALIZED_VIEW.tags_lasts_view_materialized)
.whereRaw('dup_tag_ids @> ARRAY[?] ::int[]', tagId)
.select(this.knex.raw('UNNEST(dup_tag_ids)'))
)
Expand Down Expand Up @@ -1149,7 +1149,7 @@
*/
public findRelatedTags = async ({ id }: { id: string; content?: string }) =>
this.knex
.from(VIEW.tags_lasts_view)
.from(MATERIALIZED_VIEW.tags_lasts_view_materialized)
.joinRaw(
'CROSS JOIN jsonb_to_recordset(top_rels) AS x(tag_rel_id int, count_rel int, count_common int, similarity float)'
)
Expand Down
Loading