#!/usr/bin/perl

use JSON::XS;

#list of scriptio continua languages... lang codes: https://cloud.google.com/translate/docs/languages
foreach ('zh', 'zh-CN', 'zh-TW', 'ja', 'my', 'th', 'km', 'lo') { $SCLANG{$_} = 1; };

open(FILE, "./TRANSLATIONS.json");
while(<FILE>) {
    my $ref; $ref = decode_json $_;
    $KEYPHRASES{$ref->{'lang'}}{lc($ref->{'trans'})} = 1;
    $entries++;
}
close(FILE);
$cnt = scalar(keys %KEYPHRASES); print "Found $cnt distinct keyphrases out of $entries entries...\n";

#write the opening...
#change eventTime below to FORMAT_TIMESTAMP(\"%Y-%m-%dT%X%Ez\", DATE_TRUNC(date, HOUR), \"UTC\") eventTime to collapse time resolution, but best to record at full resolution and aggregate within the API...
open(OUT, ">./QUERY.sql"); binmode(OUT, ":utf8");
print OUT "with data as (\n";
print OUT "select groupId, FORMAT_TIMESTAMP(\"%Y-%m-%dT%X%Ez\", date, \"UTC\") eventTime, STRUCT('topic' as name, 'monkeypox' as stringVal) as dimensions from (\n";
print OUT "SELECT FARM_FINGERPRINT( CONCAT(ngram, post, url) ) groupId, min(date) date FROM `gdelt-bq.gdeltv2.webngrams` WHERE ( DATE(date) >= \"2022-04-01\" AND DATE(date) <= \"2022-05-31\") and (\n";
print OUT "\n";

#write the keyword terms...
$BOOLEAN = '';
foreach $lang (keys %KEYPHRASES) {
    foreach $ngram (keys %{$KEYPHRASES{$lang}}) {
	$_=~s/\s+/ /; $_=~s/^\s+$//; $_=~s/\s+$//; $_=~s/'/\\'/g;
	#$BOOLEAN .= "(LOWER(ngram) = \"$_\" OR concat(ngram,post) like \"$_%\" OR concat(ngram, \" \", post) like \"$_%\") OR\n";
	
	if (exists($SCLANG{$lang})) { 
	    #scriptio continua language... concat ngram and post and match as-is...
	    $BOOLEAN .= "(lang='$lang' AND REGEXP_CONTAINS(CONCAT(LOWER(ngram), LOWER(post)), r'^$ngram')) OR\n";
	} else {
	    #space-segmented language... since ngrams include punctuation, require \b and use space in concat for multi-word phrases... 
	    if ($ngram=~/\s/) {
	    	$BOOLEAN .= "(lang='$lang' AND REGEXP_CONTAINS(CONCAT(LOWER(ngram), ' ', LOWER(post)), r'^$ngram\\b')) OR\n";
	    } else {
	    	$BOOLEAN .= "(lang='$lang' AND REGEXP_CONTAINS(LOWER(ngram), r'^$ngram\\b')) OR\n";
	    }
	}

   }
}
$BOOLEAN=~s/OR\s+$//;

#write the footer and close...
print OUT $BOOLEAN;
print OUT "\n\n";
print OUT ") group by groupId )\n";
print OUT ") select eventTime, groupId, ARRAY_AGG(dimensions) AS dimensions FROM data GROUP BY eventTime, groupId\n";

close(OUT);
