Generating title keyword n-grams
To regenerate the keywords in this list (gf-app-tng/titleKeywords.ts), run the query below and use the result to refresh the inclusionKeywords array.
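For reference, the file being refreshed is assumed to export a flat string array along these lines (a sketch only; the entries and the exact shape of the file are illustrative, not copied from the repo):

// titleKeywords.ts (sketch)
// Each entry is a lowercased 1- to 3-word phrase taken from the query output.
export const inclusionKeywords: string[] = [
  'software engineer', // illustrative entries only
  'data analyst',
  // ...refreshed from the query below
];

The query, step by step: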
-- Step 1: Clean titles: lowercase first, then strip punctuation
-- (lowercasing must come before the regex, or [a-z] would strip every capital letter;
-- on plain Postgres, also pass the 'g' flag so REGEXP_REPLACE strips every match)
WITH base AS (
SELECT
ROW_NUMBER() OVER () AS id,
REGEXP_REPLACE(LOWER(position), '[^a-z0-9 ]', '') AS title
FROM gf_direct.full_employment_latest
WHERE position IS NOT NULL
),
numbers AS (
-- Step 2: Generate 1 to 20 positions for token splitting
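-- (assumes the source table has at least 20 rows; titles longer than 20 words lose tokens past the 20th)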
SELECT ROW_NUMBER() OVER () AS n
FROM (SELECT NULL FROM gf_direct.full_employment_latest LIMIT 20) x
),
tokens AS (
-- Step 3: Tokenize titles by word position
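-- (SPLIT_PART returns '' for positions past the last word, so short titles produce blank tokens here; Step 4 drops them)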
SELECT
b.id,
n.n AS pos,
SPLIT_PART(b.title, ' ', n.n::INT) AS token
FROM base b
CROSS JOIN numbers n
),
clean_tokens AS (
-- Step 4: Remove blank tokens, stopwords, and seniority terms
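-- (dropped tokens leave gaps in pos, so the n-gram joins below only pair words that were
-- adjacent in the original title: 'engineer of record' never yields 'engineer record')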
SELECT id, pos, TRIM(token) AS token
FROM tokens
WHERE TRIM(token) <> ''
AND TRIM(token) NOT IN (
-- Stopwords (English + Multilingual)
'and','or','but','if','then','with','without','on','at','by','to','for','in','of','the',
'a','an','from','as','is','are','was','were',
'de','la','le','el','et','en','del','y','da','das','do','dos','di','il','gli','los','las',
'sur','pour','au','avec','una','un','um','une',
-- Seniority and title-level words
'senior','sr','junior','jr','mid','lead','leader','manager','mgr','director','executive',
'vp','president','owner','partner','head','principal','intern','trainee','entry','staff',
'assistant','associate','specialist','officer','consultant','advisor','apprentice',
'freelancer','representative'
)
),
one_grams AS (
-- Step 5: 1-gram frequencies
SELECT token, COUNT(*) AS raw_frequency, 1 AS weight
FROM clean_tokens
GROUP BY token
),
two_grams AS (
-- Step 6: 2-gram frequencies
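-- (the join on id keeps both tokens in the same title; the pos condition keeps only consecutive tokens)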
SELECT
t1.token || ' ' || t2.token AS token,
COUNT(*) AS raw_frequency,
2 AS weight
FROM clean_tokens t1
JOIN clean_tokens t2
ON t1.id = t2.id AND t1.pos + 1 = t2.pos
GROUP BY 1
),
three_grams AS (
-- Step 7: 3-gram frequencies
SELECT
t1.token || ' ' || t2.token || ' ' || t3.token AS token,
COUNT(*) AS raw_frequency,
3 AS weight
FROM clean_tokens t1
JOIN clean_tokens t2 ON t1.id = t2.id AND t1.pos + 1 = t2.pos
JOIN clean_tokens t3 ON t1.id = t3.id AND t1.pos + 2 = t3.pos
GROUP BY 1
),
combined AS (
-- Step 8: Combine all n-grams and apply weighted score
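-- (the weight multiplies frequency by n-gram length: a 3-gram seen 400 times scores 1200, level with a 1-gram seen 1200 times)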
SELECT '1-gram' AS ngram_type, token, raw_frequency, raw_frequency * weight AS weighted_score FROM one_grams
UNION ALL
SELECT '2-gram', token, raw_frequency, raw_frequency * weight FROM two_grams
UNION ALL
SELECT '3-gram', token, raw_frequency, raw_frequency * weight FROM three_grams
)
-- Step 9: Output the top 10,000 n-grams by weighted score (the weight favors longer phrases, which naturally occur less often)
SELECT *
FROM combined
ORDER BY weighted_score DESC
LIMIT 10000;
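To fold the result back into titleKeywords.ts, one option is to export the query output as JSON and run a small one-off script. This is a sketch under assumed names (results.json, the output path, and the script itself are hypothetical, not part of the repo):

// refreshKeywords.ts (hypothetical one-off helper)
import { readFileSync, writeFileSync } from 'fs';

// Shape of one exported row from the query above.
interface NgramRow {
  ngram_type: string;
  token: string;
  raw_frequency: number;
  weighted_score: number;
}

// results.json: the query output exported as a JSON array of rows.
const rows: NgramRow[] = JSON.parse(readFileSync('results.json', 'utf8'));

// Rows arrive sorted by weighted_score DESC; keep that order, drop duplicates.
const keywords = [...new Set(rows.map((r) => r.token))];

// Tokens are already punctuation-free (Step 1), but escape quotes defensively.
const body = keywords.map((k) => `  '${k.replace(/'/g, "\\'")}',`).join('\n');
writeFileSync(
  'titleKeywords.ts', // adjust to the real path under gf-app-tng
  `export const inclusionKeywords: string[] = [\n${body}\n];\n`,
);

If titleKeywords.ts carries anything beyond the array, paste the generated entries in by hand instead of overwriting the whole file.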