Generating title keyword n-grams
To regenerate the keywords in this list (gf-app-tng/titleKeywords.ts), run the query below and use the result to refresh the inclusionKeywords array.
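For reference, the file being refreshed is assumed to export a flat string array along these lines (a sketch only; the entries and the exact shape of the file are illustrative, not copied from the repo):

// titleKeywords.ts (sketch)
// Each entry is a lowercased 1- to 3-word phrase taken from the query output.
export const inclusionKeywords: string[] = [
  'software engineer', // illustrative entries only
  'data analyst',
  // ...refreshed from the query below
];

The query, step by step: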
-- Step 1: Clean titles: lowercase first, then strip punctuation
-- (lowercasing must come before the regex, or [a-z] would strip every capital letter;
-- on plain Postgres, also pass the 'g' flag so REGEXP_REPLACE strips every match)
WITH base AS (
SELECT
ROW_NUMBER() OVER () AS id,
REGEXP_REPLACE(LOWER(position), '[^a-z0-9 ]', '') AS title
FROM gf_direct.full_employment_latest
WHERE position IS NOT NULL
),
numbers AS (
-- Step 2: Generate 1 to 20 positions for token splitting
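-- (assumes the source table has at least 20 rows; titles longer than 20 words lose tokens past the 20th)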
SELECT ROW_NUMBER() OVER () AS n
FROM (SELECT NULL FROM gf_direct.full_employment_latest LIMIT 20) x
),
tokens AS (
-- Step 3: Tokenize titles by word position
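-- (SPLIT_PART returns '' for positions past the last word, so short titles produce blank tokens here; Step 4 drops them)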
SELECT
b.id,
n.n AS pos,
SPLIT_PART(b.title, ' ', n.n::INT) AS token
FROM base b
CROSS JOIN numbers n
),
clean_tokens AS (
-- Step 4: Remove blank tokens, stopwords, and seniority terms
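-- (dropped tokens leave gaps in pos, so the n-gram joins below only pair words that were
-- adjacent in the original title: 'engineer of record' never yields 'engineer record')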
SELECT id, pos, TRIM(token) AS token
FROM tokens
WHERE TRIM(token) <> ''
AND TRIM(token) NOT IN (
-- Stopwords (English + Multilingual)
'and','or','but','if','then','with','without','on','at','by','to','for','in','of','the',
'a','an','from','as','is','are','was','were',
'de','la','le','el','et','en','del','y','da','das','do','dos','di','il','gli','los','las',
'sur','pour','au','avec','una','un','um','une',
-- Seniority and title-level words
'senior','sr','junior','jr','mid','lead','leader','manager','mgr','director','executive',
'vp','president','owner','partner','head','principal','intern','trainee','entry','staff',
'assistant','associate','specialist','officer','consultant','advisor','apprentice',
'freelancer','representative'
)
),
one_grams AS (
-- Step 5: 1-gram frequencies
SELECT token, COUNT(*) AS raw_frequency, 1 AS weight
FROM clean_tokens
GROUP BY token
),
two_grams AS (
-- Step 6: 2-gram frequencies
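-- (the join on id keeps both tokens in the same title; the pos condition keeps only consecutive tokens)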
SELECT
t1.token || ' ' || t2.token AS token,
COUNT(*) AS raw_frequency,
2 AS weight
FROM clean_tokens t1
JOIN clean_tokens t2
ON t1.id = t2.id AND t1.pos + 1 = t2.pos
GROUP BY 1
),
three_grams AS (
-- Step 7: 3-gram frequencies
SELECT
t1.token || ' ' || t2.token || ' ' || t3.token AS token,
COUNT(*) AS raw_frequency,
3 AS weight
FROM clean_tokens t1
JOIN clean_tokens t2 ON t1.id = t2.id AND t1.pos + 1 = t2.pos
JOIN clean_tokens t3 ON t1.id = t3.id AND t1.pos + 2 = t3.pos
GROUP BY 1
),
combined AS (
-- Step 8: Combine all n-grams and apply weighted score
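-- (the weight multiplies frequency by n-gram length: a 3-gram seen 400 times scores 1200, level with a 1-gram seen 1200 times)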
SELECT '1-gram' AS ngram_type, token, raw_frequency, raw_frequency * weight AS weighted_score FROM one_grams
UNION ALL
SELECT '2-gram', token, raw_frequency, raw_frequency * weight FROM two_grams
UNION ALL
SELECT '3-gram', token, raw_frequency, raw_frequency * weight FROM three_grams
)
-- Step 9: Output the top 10,000 n-grams by weighted score (the weight favors longer phrases, which naturally occur less often)
SELECT *
FROM combined
ORDER BY weighted_score DESC
LIMIT 10000;
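To fold the result back into titleKeywords.ts, one option is to export the query output as JSON and run a small one-off script. This is a sketch under assumed names (results.json, the output path, and the script itself are hypothetical, not part of the repo):

// refreshKeywords.ts (hypothetical one-off helper)
import { readFileSync, writeFileSync } from 'fs';

// Shape of one exported row from the query above.
interface NgramRow {
  ngram_type: string;
  token: string;
  raw_frequency: number;
  weighted_score: number;
}

// results.json: the query output exported as a JSON array of rows.
const rows: NgramRow[] = JSON.parse(readFileSync('results.json', 'utf8'));

// Rows arrive sorted by weighted_score DESC; keep that order, drop duplicates.
const keywords = [...new Set(rows.map((r) => r.token))];

// Tokens are already punctuation-free (Step 1), but escape quotes defensively.
const body = keywords.map((k) => `  '${k.replace(/'/g, "\\'")}',`).join('\n');
writeFileSync(
  'titleKeywords.ts', // adjust to the real path under gf-app-tng
  `export const inclusionKeywords: string[] = [\n${body}\n];\n`,
);

If titleKeywords.ts carries anything beyond the array, paste the generated entries in by hand instead of overwriting the whole file.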