#!/usr/bin/perl use JSON::XS; #list of scriptio continua languages... lang codes: https://cloud.google.com/translate/docs/languages foreach ('zh', 'zh-CN', 'zh-TW', 'ja', 'my', 'th', 'km', 'lo') { $SCLANG{$_} = 1; }; open(FILE, "./TRANSLATIONS.json"); while() { my $ref; $ref = decode_json $_; $KEYPHRASES{$ref->{'lang'}}{lc($ref->{'trans'})} = 1; $entries++; } close(FILE); $cnt = scalar(keys %KEYPHRASES); print "Found $cnt distinct keyphrases out of $entries entries...\n"; #write the opening... #change eventTime below to FORMAT_TIMESTAMP(\"%Y-%m-%dT%X%Ez\", DATE_TRUNC(date, HOUR), \"UTC\") eventTime to collapse time resolution, but best to record at full resolution and aggregate within the API... open(OUT, ">./QUERY.sql"); binmode(OUT, ":utf8"); print OUT "with data as (\n"; print OUT "select groupId, FORMAT_TIMESTAMP(\"%Y-%m-%dT%X%Ez\", date, \"UTC\") eventTime, STRUCT('topic' as name, 'monkeypox' as stringVal) as dimensions from (\n"; print OUT "SELECT FARM_FINGERPRINT( CONCAT(ngram, post, url) ) groupId, min(date) date FROM `gdelt-bq.gdeltv2.webngrams` WHERE ( DATE(date) >= \"2022-04-01\" AND DATE(date) <= \"2022-05-31\") and (\n"; print OUT "\n"; #write the keyword terms... $BOOLEAN = ''; foreach $lang (keys %KEYPHRASES) { foreach $ngram (keys %{$KEYPHRASES{$lang}}) { $_=~s/\s+/ /; $_=~s/^\s+$//; $_=~s/\s+$//; $_=~s/'/\\'/g; #$BOOLEAN .= "(LOWER(ngram) = \"$_\" OR concat(ngram,post) like \"$_%\" OR concat(ngram, \" \", post) like \"$_%\") OR\n"; if (exists($SCLANG{$lang})) { #scriptio continua language... concat ngram and post and match as-is... $BOOLEAN .= "(lang='$lang' AND REGEXP_CONTAINS(CONCAT(LOWER(ngram), LOWER(post)), r'^$ngram')) OR\n"; } else { #space-segmented language... since ngrams include punctuation, require \b and use space in concat for multi-word phrases... if ($ngram=~/\s/) { $BOOLEAN .= "(lang='$lang' AND REGEXP_CONTAINS(CONCAT(LOWER(ngram), ' ', LOWER(post)), r'^$ngram\\b')) OR\n"; } else { $BOOLEAN .= "(lang='$lang' AND REGEXP_CONTAINS(LOWER(ngram), r'^$ngram\\b')) OR\n"; } } } } $BOOLEAN=~s/OR\s+$//; #write the footer and close... print OUT $BOOLEAN; print OUT "\n\n"; print OUT ") group by groupId )\n"; print OUT ") select eventTime, groupId, ARRAY_AGG(dimensions) AS dimensions FROM data GROUP BY eventTime, groupId\n"; close(OUT);