STRING_LIST |
/**
* Convert all my_list elements to strings.
*
* Examples:
* STRING_LIST([0.2,-1.3,5]) = ['0.2','-1.3','5']
*
* @param my_list A list.
* @return my_string_list my_list converted to strings.
*/
function string_list(my_list) {
MAP(my_list, (x) -> TO_STRING(x))
} |
LIST_INSERT |
/**
Insert insert_list into my_list at index, pushing subsequent elements down
by the number of elements of insert_list.
Examples:
LIST_INSERT(RANGE(-3,3), RANGE(99,102), 3) = [-3,-2,-1,99,100,101,0,1,2]
@param my_list A list.
@param insert_list List to be inserted into my_list.
@param index Index at which insert_list will be inserted.
@return my_longer_list my_list with insert_list inserted at index.
*/
function list_insert(my_list, insert_list, index) {
list_size = SIZE(my_list);
insert_size = SIZE(insert_list);
indices = RANGE(0, list_size + insert_size);
MAP(indices,
(i) -> IF(i < index,
my_list[i],
IF(i < index + insert_size,
insert_list[i - index],
my_list[i - insert_size])))
} |
LIST_UNION |
/**
Union list1 and list2 by appending list2 to list1.
Identical to LIST_APPEND().
Examples:
LIST_UNION(RANGE(-3,3), RANGE(99,102)) = [-3,-2,-1,0,1,2,99,100,101]
@param list1 A list.
@param list2 List to be appended to list1.
@return my_longer_list list1 with list2 appended.
*/
function list_union(list1, list2) {
LIST_INSERT(list1, list2, SIZE(list1))
} |
LIST_CONCAT_WS |
/**
Concatenate the elements of my_list using delimiter.
Examples:
LIST_CONCAT_WS(‘,’,[3,2,1,11,-1,-20]) = ‘3,2,1,11,-1,-20’ // CSV numeric list.
@param delimiter String.
@param my_list List to be concatenated using delimiter.
@return my_list_concat_ws String with the elements of my_list concatenated, separated by delimiter.
*/
function list_concat_ws(delimiter, my_list) {
REDUCE(MAP(my_list,
(x) -> TO_STRING(x)),
(x,y) -> CONCAT_WS(delimiter, x, y))
} |
LISTS_TO_MAP |
/**
Turn two lists of equal-length keys and values into JSON then into a map.
Examples:
LISTS_TO_MAP([‘v’],[1]) = {‘v’:1}
LISTS_TO_MAP([‘a’,’b’,’c’],[1,2,3]) = {‘a’:1, ‘b’:2, ‘c’:3}
LISTS_TO_MAP([-2.0, 10, 40], [0,0,0]) = {‘-2’:0, ‘10’:0, ‘40’:0}
@param keys A list of keys.
@param values A list of values.
@return my_map keys and values combined and converted into a map.
*/
function lists_to_map(keys, values) {
idx = RANGE(0,SIZE(keys));
output = MAP(idx, (i) -> CONCAT(‘”’, TO_STRING(keys[i]), ‘”:’, JSON(values[i])));
JSON_TO_MAP(CONCAT(‘{’, LIST_CONCAT_WS(‘,’,output), ‘}’))
} |
MAP_KEYS |
/**
Retrieve a list of the keys of my_map.
Examples:
MAP_KEYS(JSON_TO_MAP(‘{“key1”:”value1”,”key2”:”value2”}’)) = [‘key1’,’key2’]
MAP_KEYS({‘foo’: 1, ‘bar’: 2}) = [‘bar’, ‘foo’]
MAP_KEYS({‘’: 1, ‘bar’: 2}) = [‘’, ‘bar’]
MAP_KEYS({‘’: 1, ‘’: 2}) = [‘’]
@param my_map A map, e.g., {cat=1, num=2}.
@return keys List of keys, e.g., [‘cat’,’num’].
*/
function map_keys(my_map) {
MAP_TO_LIST(my_map, (k,v) -> k)
} |
RANGE_STEP |
/**
Return a list of numbers from start to end in steps of step.
Examples:
RANGE_STEP(-2,10,2) = [-2,0,2,4,6,8]
RANGE_STEP(10,-2,-2) = [10,8,6,4,2,0]
start=MILLIS(TO_DATETIME(‘2020-01-01 00:00:00.000’));
end=MILLIS(TO_DATETIME('2020-01-06 00:00:00.001'));
step=86400000; // 1 day in milliseconds
RANGE_STEP(start, end, step)
= [1577836800000, 1577923200000, 1578009600000, 1578096000000, 1578182400000, 1578268800000]
@param start Numeric scalar, e.g., -2.
@param end Numeric scalar, e.g., 10.
@param step Numeric scalar, e.g., 2. Nonzero allowed.
@return step_list Numeric list, e.g., [-2,0,2,4,6,8].
*/
function range_step(start, end, step) {
num_steps = (end - start) / step;
steps = RANGE(0,CEIL(num_steps));
step_list = MAP(steps,
(x) -> x * step + start);
step_list
} |
TIMESTAMP_RANGE |
/**
Return a list of integer millisecond timestamps from start to
end in steps of step.
Examples:
start=’2020-01-01 00:00:00.000’;
end='2020-01-06 00:00:00.001';
step=86400000; // 1 day in milliseconds
TIMESTAMP_RANGE(start, end, step)
= [1577836800000, 1577923200000, 1578009600000, 1578096000000, 1578182400000, 1578268800000]
TIMESTAMP_RANGE(‘2020-05-01’,’2020-05-06’,86400*1000)
= [1588291200000, 1588377600000, 1588464000000, 1588550400000, 1588636800000]
@param start Datetime-parseable string, e.g., ‘2020-01-01 00:00:00.000’.
@param end Datetime-parseable string, e.g., ‘2020-02-02 00:00:00.001’.
@param step Numeric scalar in milliseconds.
@return step_list Millisecond timestamp list, e.g., [1588291200000, 1588377600000, …].
*/
function timestamp_range(start, end, step) {
start = MILLIS(TO_DATETIME(start));
end = MILLIS(TO_DATETIME(end));
RANGE_STEP(start, end, step)
} |
PERIODIC_BIN |
/**
Round the remainder of x into periodic bins (e.g., % 7, 24, 365 for weekly, hourly,
or annual use cases). Bins start at 0,1,2,….
Scale the periodic remainder of x % period integers num_bins bins. To
preserve or adjust the scale/units of period, do pre-/post-adjustments: e.g.,
scale * PERIODIC_BIN() + location.
Examples:
// Bin a number in (0,30) days.
PERIODIC_BIN(30 * 0.8, 1, 24) = 0 // Numeric hour of day.
PERIODIC_BIN(30 * 0.8, 7, 7) = 3 // Numeric day of week.
PERIODIC_BIN(30 * 0.8, 1, 4) = 0 // Four 6-hour blocks during each day.
@param x Scalar.
@param period Scalar.
@param num_bins Scalar.
@return x_bin The periodic integer bin that x falls into.
*/
function periodic_bin(x, period, num_bins) {
FLOOR(num_bins * (x % period)/period )
} |
BETWEEN |
/**
If start <= x < end, return 1. Otherwise return 0.
Examples:
start = 700; end = -700; // Days
reference_time = MILLIS(TO_DATETIME('2022-01-01 9:00:00.000'));
// Replace this expression...
IF(reference_time - end*24*3600*1000 <= timestamp
AND timestamp < reference_time - start*24*3600*1000, 1, 0)
-
// With this more readable expression using the UDF.
BETWEEN(reference_time - timestamp,
-
-
BETWEEN(1, 0, 0) = 0
BETWEEN(0, 0, 0) = 0
BETWEEN(0, 0, 1) = 1
BETWEEN(0, 1, -1) = 0
BETWEEN(0, -1, 1) = 1
BETWEEN(‘b’, ‘a’, ‘c’) = 1
BETWEEN(‘a’, ‘b’, ‘c’) = 0
BETWEEN(NOW(), 0, 1) = 0
BETWEEN(NOW(), 0, null) = 0
BETWEEN(NOW(), 0, 2000000000000) = 1
BETWEEN(NOW(), null, 2000000000000) = 1
BETWEEN(NOW(), ‘’, ‘2000000000000’) = 1
BETWEEN(NOW(), 0, null) = 0
@param x Scalar.
@param start Scalar.
@param end Scalar.
@return is_x_between IF(start <= x < end, 1, 0).
*/
function between(x, start, end) {
IF(start <= x AND x < end, 1, 0)
} |
HISTOGRAM_BIN |
/**
Count the number of elements of my_list in [lower_edge, upper_edge).
Examples:
HISTOGRAM_BIN([1,1,2,4],1,2) = 2
HISTOGRAM_BIN([1,1,2,4],1,2.01) = 3
HISTOGRAM_BIN([1,1,2,4],2,4) = 1
@param my_list A numeric list.
@param lower_edge Scalar.
@param upper_edge Scalar.
@return count Number of elements of my_list in [lower_edge, upper_edge).
*/
function histogram_bin(my_list, lower_edge, upper_edge) {
count = SUM(MAP(my_list,
(x) -> IF(BETWEEN(x, lower_edge, upper_edge), 1, 0)));
count
} |
RANK |
/**
Retrieve the value of rank r from my_list. Rank is
determined by top as descending (TRUE) or ascending (FALSE).
Examples:
RANK(RANGE(90,101), 3, TRUE) = 98 // (3rd in […98,99,100])
// (3rd order statistic of 100 draws from random uniform.
RANK(RANDOM_LIST(100), 3, TRUE) > 0.8
@param my_list A list.
@param r Rank to retrieve in sorted list.
@param top Descending (TRUE) or ascending (FALSE).
@return r_value The value of the r^th-rank element of my_list
*/
function rank(my_list, r, top) {
index = IF(top, SIZE(my_list) - r, r - 1);
value = SORT(my_list)[index];
value
} |
LIST_APPEND |
/**
Append insert_list to my_list (at the end).
Examples:
LIST_APPEND(RANGE(-3,3), RANGE(99,102)) = [-3,-2,-1,0,1,2,99,100,101]
@param my_list A list.
@param insert_list List to be appended to my_list.
@return my_longer_list my_list with insert_list appended.
*/
function list_append(my_list, insert_list) {
LIST_INSERT(my_list, insert_list, SIZE(my_list))
} |
DROP_NULLS |
/**
Remove all null elements from my_list, reducing its length.
Examples:
DROP_NULLS([null]) = []
DROP_NULLS([2,2,4,4]) = [2,2,4,4]
DROP_NULLS([null,null,2,2,4,4]) = [2,2,4,4]
DROP_NULLS([‘’]) = [‘’]
DROP_NULLS([‘null’]) = [‘null’]
DROP_NULLS([‘a’, null, null]) = [‘a’]
DROP_NULLS([‘a’, null, ‘a’]) = [‘a’, ‘a’]
@param my_list A list, e.g., [null,null,2,2,4,4].
@return my_drop_nulls_list A list containing only the non-null elements, e.g., [2,2,4,4]
*/
function DROP_NULLS(my_list) {
FILTER(my_list, (x) -> x!=null)
} |
WHICH |
/**
Return a list of indices of my_list elements that are truthy.
Examples:
WHICH([0,10,2,-4.3]) = [1,2,3]
WHICH([0,1,0,1]) = [1,3]
// Strings are false, except LOWER(‘tRuE’)=’true’.
WHICH([FALSE,’hi’,FALSE,’hi’]) = []
WHICH([‘tRuE’,TRUE,null]) = [0,1]
WHICH(null) = null
WHICH([]) = []
WHICH([null]) = []
WHICH(‘string’) = null
WHICH(‘tRuE’) = null
@param my_list Numeric list, e.g. [0,10,2,-4.3].
@return indices List of indices of elements with truthy values, e.g., [1,2,3].
*/
function which(my_list) {
// Return the list indices that are truthy.
idx = RANGE(0,SIZE(my_list));
FILTER(idx, (x) -> IF(my_list[x], TRUE, null))
} |
WHICH_MAX |
/**
Return the index corresponding to the maximum value of my_list.
Examples:
WHICH_MAX([0,10,2,-4.3]) = [1]
WHICH_MAX([1,3,4,5,2,-4]) = [3]
WHICH_MAX([]) = []
WHICH_MAX(null) = null
WHICH_MAX([null,-2]) = [1]
WHICH_MAX([-2,-2.0,null,-3]) = [0,1]
WHICH_MAX([‘c’,-2]) = [1]
WHICH_MAX([‘c’,’d’]) = []
@param my_list Numeric list, e.g. [0,10,2,-4.3].
@return index Numeric scalar, e.g., 1.
*/
function which_max(my_list) {
// Return the index of the maximum value.
max_value = AG_MAX(my_list);
index = WHICH(MAP(my_list, (x) -> IF(x=max_value, 1, 0)));
index
} |
IS_NUMERIC |
/**
Test whether x can be cast to numeric/double: TO_DOUBLE(x)!=null.
Examples:
IS_NUMERIC(‘c’) = FALSE
IS_NUMERIC([‘c’]) = FALSE
IS_NUMERIC({‘c’:1}) = FALSE
IS_NUMERIC(null) = FALSE
IS_NUMERIC(‘1’) = TRUE
IS_NUMERIC(‘-1.23’) = TRUE
IS_NUMERIC(1) = TRUE
IS_NUMERIC(0) = TRUE
IS_NUMERIC(-1) = TRUE
IS_NUMERIC(-1.23) = TRUE
@param x An object.
@return is_numeric A boolean denoting whether x can be cast to numeric/double.
*/
function IS_NUMERIC(x) {
// If x can be cast to double without returning null.
IF(TO_DOUBLE(x)=null, FALSE, TRUE)
} |
MAP_VALUES |
/**
Retrieve a list of the keys of my_map.
Examples:
MAP_VALUES(JSON_TO_MAP(‘{“key1”:”value1”,”key2”:”value2”}’)) = [‘value1’,’value2’]
MAP_VALUES({‘foo’: 1, ‘bar’: 2}) = [2, 1]
MAP_VALUES({‘’: 1, ‘bar’: 2}) = [1, 2]
MAP_VALUES({‘’: 1, ‘’: 2}) = [2]
@param my_map A map, e.g., {cat=1, num=2}.
@return keys List of values, e.g., [‘1’,’2’].
*/
function map_values(my_map) {
MAP_TO_LIST(my_map, (k,v) -> v)
} |
MODE |
/**
Return a list containing the mode or most common value(s), in case
of ties, of my_list.
Examples:
MODE([1,2,2,2,4,4,4]) = [2,4]
MODE([1,2.0,2,2,4,4,4]) = [2,4]
MODE([‘1’,’2’,’2’,’2’,’4’,’4’,’4’]) = [2,4]
MODE([‘1’,’L2’,’L2’,’L2’,’4’,’4’,’4’]) = [4,’L2’]
MODE([1]) = [1]
MODE(null) = [] // TODO: MODE() appears to ignore nulls by default.
MODE([null]) = [] // TODO: MODE() appears to ignore nulls by default.
MODE([‘’]) = [‘’]
MODE([‘c’]) = [‘c’]
MODE([‘c’,1]) = [1,’c’]
MODE([1, ‘’]) = [‘’,1]
MODE([]) = []
MODE([1, null]) = [1]
@param my_list A list, e.g., [1,2,2,2,4,4,4].
@return mode A list containing the mode or most common value of my_list.
*/
function mode(my_list) {
my_map_hist = COUNT_UNIQ(STRING_LIST(my_list));
mode_indices = WHICH_MAX(MAP_VALUES(my_map_hist));
my_map_keys = MAP_KEYS(my_map_hist);
values = MAP(mode_indices,
(x) -> v = my_map_keys[x];
IF(IS_NUMERIC(v), TO_DOUBLE(v), v));
values
} |
MODE_NS |
/**
Return a list containing the mode or most common value(s), in case
of ties, of my_list, ignoring nulls.
Examples:
MODE([null,null,2,2,2,4,4,4]) = [2,4]
MODE([null,null,1,2.0,2,2,4,4,4]) = [2,4]
MODE([null,null,’1’,’2’,’2’,’2’,’4’,’4’,’4’]) = [2,4]
MODE([null,null,’1’,’L2’,’L2’,’L2’,’4’,’4’,’4’]) = [4,’L2’]
MODE(null) = [] // TODO: MODE() appears to ignore nulls by default.
MODE([null]) = [] // TODO: MODE() appears to ignore nulls by default.
MODE([‘’]) = [‘’]
MODE([‘c’]) = [‘c’]
MODE([‘c’,1]) = [1,’c’]
MODE([]) = []
@param my_list A list, e.g., [null,null,2,2,2,4,4,4].
@return mode A list containing the mode or most common value of my_list.
*/
function mode_ns(my_list) {
MODE(DROP_NULLS(my_list))
} |
BIN_INDEX |
/**
Return the index of the bin_edges where x is between bin_edges[x_index] and
bin_edges[x_index + 1]. Return -1 if x is less than bin_edges[0].
Examples:
BIN_INDEX(2, [-2,0,3,10,20]) = 1 // (index) because 0<=2<3.
BIN_INDEX(30 * 0.8 - 5, [-2,0,3,10,20]) = 3
BIN_INDEX(null, [0,1]) = -1
BIN_INDEX(‘’, [0,1]) = -1
BIN_INDEX(‘abc’, [0,1]) = 1
BIN_INDEX(‘-1’, [0,1]) = -1
BIN_INDEX(‘2’, [0,1]) = 1
BIN_INDEX(‘0’, [null,1]) = 0
BIN_INDEX(‘0’, [null,null]) = 1
BIN_INDEX(‘0’, [‘’,’’]) = 1
BIN_INDEX(‘0’, [-100,-1]) = 1
BIN_INDEX(‘0’, [-1,-100]) = 1 // TODO: -1?
BIN_INDEX(‘1’, [1,3,2]) = 0 // TODO: -1?
@param x Scalar.
@param base A pre-sorted (increasing) numeric list, e.g., [-2,0,3,10,40].
@return x_index The log-bin that x falls into in units of x.
*/
function bin_index(x, bin_edges) {
num_edges = SIZE(bin_edges);
idx=RANGE(0,num_edges - 1);
indices = FILTER(idx,
(i) -> BETWEEN(x,
bin_edges[i],
bin_edges[i + 1]));
// Deal with the first and last bin edges.
IF(x < bin_edges[0],
-1,
IF(bin_edges[num_edges - 1] <= x,
num_edges - 1,
indices[0]))
} |
LIST_INSERT_VALUE |
/**
Insert value into my_list at index, pushing subsequent elements down by one
(+1) index.
Examples:
LIST_INSERT_VALUE(RANGE(-3,3), 100, 3) = [-3,-2,-1,100,0,1,2]
LIST_INSERT_VALUE([], null, 0) = [null]
LIST_INSERT_VALUE([], ‘’, 0) = [‘’]
LIST_INSERT_VALUE([null], ‘’, 0) = [‘’, null]
LIST_INSERT_VALUE([‘’], null, 0) = [null, ‘’]
LIST_INSERT_VALUE([‘’], ‘’, 0) = [‘’, ‘’]
LIST_INSERT_VALUE([0,1], null, 0) = [null, 0, 1]
LIST_INSERT_VALUE([], ‘a’, 0) = [‘a’]
LIST_INSERT_VALUE([], [], 0) = [[]]
@param my_list A list.
@param value Value to be inserted into my_list.
@param index Index at which value will be inserted.
@return my_longer_list my_list with value inserted at index.
*/
function list_insert_value(my_list, value, index) {
list_size = SIZE(my_list);
indices = RANGE(0,list_size + 1);
MAP(indices,
(i) -> IF(i < index,
my_list[i],
IF(i = index,
value,
my_list[i - 1])))
} |
LIST_PREPEND_VALUE |
/**
Append value to my_list (at the beginning), pushing subsequent elements down
by one (+1) index.
Examples:
LIST_PREPEND_VALUE(RANGE(-3,3), 100) = [100,-3,-2,-1,0,1,2]
LIST_PREPEND_VALUE([], null) = [null]
LIST_PREPEND_VALUE([], ‘’) = [‘’]
LIST_PREPEND_VALUE([null], ‘’) = [‘’, null]
LIST_PREPEND_VALUE([‘’], null) = [null, ‘’]
LIST_PREPEND_VALUE([‘’], ‘’) = [‘’, ‘’]
LIST_PREPEND_VALUE([0,1], null) = [null, 0, 1]
LIST_PREPEND_VALUE([], ‘a’) = [‘a’]
LIST_PREPEND_VALUE([], []) = [[]]
@param my_list A list.
@param value Value to be prepended to my_list.
@return my_longer_list my_list with value prepended.
*/
function list_prepend_value(my_list, value) {
LIST_INSERT_VALUE(my_list, value, 0)
} |
BIN_VALUE |
/**
Return the value of the largest bin_edge <= x.
For x < the smallest bin_edge, return smallest bin_edge - 1.
Examples:
BIN_VALUE(2, [-2,0,3,10,20]) = 0 // (value) because 0<=2<3.
BIN_VALUE(30 * 0.8 - 5, [-2,0,3,10,20]) = 10
BIN_VALUE(0, [0, 1]) = 0
BIN_VALUE(-2, [0, 1]) = -1
BIN_VALUE(1, [0, 1]) = 1
BIN_VALUE(2, [0, 1]) = 1
BIN_VALUE(null, [0, 1]) = -1
BIN_VALUE(0, [3, null]) = 2
BIN_VALUE(0, [10, 1]) = 9 // TODO: bins should be monotonic?
@param x The numerical value you would like to bin.
@param bin_edges A list of increasing numbers that define the smaller (inclusive) edges of the bins.
@return bin_edge The value of the largest bin_edge <= x.
*/
function bin_value(x, bin_edges) {
// TODO: Check if redundant with WHICH_BIN().
// bin_edges = [0, 1, 2, 3, 4, 5, 8, 10];
bin_index = BIN_INDEX(x, bin_edges);
// Set the bin for ‘less than bin_edges[0]’ to bin_edges[0] - 1, since
// we can’t know the true minimum value of all numbers in the bin.
bin_edges = LIST_PREPEND_VALUE(bin_edges, bin_edges[0] - 1);
// Return the value of the bin, not the index.
// Increase the bin_index by 1—BIN_INDEX() returns -1, 0, … SIZE(bin_index) - 1.
// We prepended a value for the smallest bin, so we shift up all indices by 1.
bin_edges[bin_index + 1]
} |
STR |
/**
Rename to_string() as just str().
@param x Scalar or string.
@return y Numeric scalar.
*/
function str(x) {
TO_STRING(x)
} |
WHICH_BIN_INDEX |
/**
Return the maximum index of the bin_edges where x is between bin_edges[x_index]
and bin_edges[x_index + 1] (smaller and larger edges of the bin). Return -1 if x is
less than bin_edges[0].
Note: If bin_edges is not sorted, return the largest index where bid_edges[index]<=x.
Examples:
WHICH_BIN_INDEX(2, [-2,0,3,10,20]) = 1 // (index) because 0<=2<3.
WHICH_BIN_INDEX(30 * 0.8 - 5, [-2,0,3,10,20]) = 3
WHICH_BIN_INDEX(0, [1,2]) = -1
WHICH_BIN_INDEX(1, [1,2]) = 0
WHICH_BIN_INDEX(2, [1,2]) = 1
WHICH_BIN_INDEX(3, [1,2]) = 1
WHICH_BIN_INDEX(1, [2,1]) = 1 // TODO: -1. Bins not sorted.
WHICH_BIN_INDEX(1, [0,2,1]) = 2 // TODO: -1. Bins not sorted.
WHICH_BIN_INDEX(1, [null,1,2]) = 1
WHICH_BIN_INDEX(1, [1,null,2]) = 1 // TODO: 0. null should be ignored.
WHICH_BIN_INDEX(1, [1,2,null]) = 2 // TODO: 0. null should be ignored.
WHICH_BIN_INDEX(1, [‘c’,1,2]) = 1
WHICH_BIN_INDEX(1, [1,’c’,2]) = 0
WHICH_BIN_INDEX(1, [1,2,’c’]) = 0
WHICH_BIN_INDEX(null, [1,2]) = -1 // TODO: null. null should be null.
WHICH_BIN_INDEX([], [1,2]) = 1 // TODO: null. [] should be null. Or throw an error.
WHICH_BIN_INDEX(1, []) = -1 // TODO: null or -1? No bins should be null, or less than the smallest bin.
WHICH_BIN_INDEX(1, null) = -1 // TODO: null or -1? No bins should be null, or less than the smallest bin.
@param x Scalar.
@param bin_edges A pre-sorted (increasing) numeric list, e.g., [-2,0,3,10,40].
@return x_index The index of the bin that x falls into in units of x.
*/
function which_bin_index(x, bin_edges) {
COALESCE(AG_MAX(WHICH(MAP(bin_edges, (b) -> IF(b <= x, 1, 0)))), -1)
} |
WHICH_BIN |
/**
Retrieve the left (smaller) edge of the bin where bin_edges[k] <= x < bind_edges[k+1].
Return the value bin_edge where x is between bin_edges[k] and bin_edges[k + 1]
(smaller and larger edges of the bin). Return bin_edges[0] - 1 if x is less than
bin_edges[0].
Examples:
WHICH_BIN(2, [-2,0,3,10,20]) = 0 // (value) because 0<=2<3.
WHICH_BIN(30 * 0.8 - 5, [-2,0,3,10,20]) = 10
WHICH_BIN(0, [10,20]) = 9
WHICH_BIN(10, [10,20]) = 10
WHICH_BIN(20, [10,20]) = 20
WHICH_BIN(30, [10,20]) = 20
WHICH_BIN(10, [20,10]) = 10 // TODO: 10? Bins not sorted.
WHICH_BIN(10, [0,20,10]) = 10 // TODO: 10? Bins not sorted.
WHICH_BIN(10, [null,10,20]) = 10
WHICH_BIN(10, [10,null,20]) = null // TODO: 10. null should be ignored.
WHICH_BIN(10, [10,20,null]) = null // TODO: 10. null should be ignored.
WHICH_BIN(10, [‘c’,10,20]) = 10
WHICH_BIN(10, [10,’c’,20]) = 10
WHICH_BIN(10, [10,20,’c’]) = 10
WHICH_BIN(null, [10,20]) = 9 // TODO: null. null should be null.
WHICH_BIN([], [10,20]) = 20 // TODO: null. [] should be null. Or throw an error.
WHICH_BIN(10, []) = null // No bins, no bin value.
WHICH_BIN(10, null) = null // No bins, no bin value.
@param x Scalar.
@param base A pre-sorted (increasing) numeric list, e.g., [-2,0,3,10,40].
@return bin_edge The bin that x falls into in units of x.
*/
function which_bin(x, bin_edges) {
index = WHICH_BIN_INDEX(x, bin_edges);
IF(index>=0, bin_edges[index], bin_edges[0] - 1)
} |
LIST_SUB |
/**
Subset my_list to the elements in indices.
Examples:
LIST_SUB([‘a’,’b’,’c’,’d’], [0,3]) = [‘a’,’d’]
LIST_SUB(null,[]) = null
LIST_SUB([],null) = null
[][null] = null
null[null] = null
null[[]] = null
@param my_list A list.
@param indices A list of integer indices.
@return list_subset List with only elements in indices.
*/
function list_sub(my_list, indices) {
IF(my_list=null, null, MAP(indices, (x) -> my_list[x]))
} |
MOST_RECENT_TIES |
/**
Find and return a list of the most recent event(s) of a given type.
Examples:
// No recent records from the first record.
MOST_RECENT_TIES(timestamp, ‘request’) = []
// Current record is the most recent from itself + 1 millisecond.
SIZE(MOST_RECENT_TIES(timestamp+1, type)) = 1
// First record is the most recent from itself + 1 millisecond.
MOST_RECENT_TIES(timestamp+1, type) = [${@timeline.events[0]}]
// First record has no events preceding it.
MOST_RECENT_TIES(timestamp, type) = []
// First record is not most recent to itself.
MOST_RECENT_TIES(timestamp, type) != timeline.events[0] = TRUE
MOST_RECENT_TIES(timestamp - 1, type) != timeline.events[0] = TRUE
// Erroneous inputs return the empty list (no recent event).
MOST_RECENT_TIES(timestamp, ‘INVALID_TYPE’) = []
MOST_RECENT_TIES(timestamp, ‘’) = []
MOST_RECENT_TIES(timestamp, null) = []
MOST_RECENT_TIES(null, type) = []
MOST_RECENT_TIES(null, null) = []
MOST_RECENT_TIES(0, type) = []
// // Collect the third event.
// third_event = timeline.events[2];
// // Overwrite the first two events with the third event’s timestamp & type.
// ignore_output_var = MAP([0,1],
// (i) -> SET_PROPERTY(timeline.events[i],’type’,third_event.type);
// SET_PROPERTY(timeline.events[i],’timestamp’,third_event.timestamp);
// ‘ignored_output’);
//
// // See whether there are multiple most recent events
// // and whether the 3rd one is last in the list.
// recent_events = MOST_RECENT_TIES(third_event.timestamp + 1, third_event.type);
// (recent_events[2] = third_event AND SIZE(recent_events) = 3)
TRUE
@param current_time Timestamp in milliseconds.
@param filter_type String denoting the event type, e.g., ‘request’.
@return event_list_including_ties A list of events to accommodate potential ties.
*/
function most_recent_ties(current_time, filter_type) {
filtered_events = FILTER(timeline.events,
(x) -> x.type=filter_type);
time_filtered_events = FILTER(filtered_events,
(x) -> x.timestamp < current_time);
filtered_timestamps = MAP(time_filtered_events,
(x) -> x.timestamp);
event_list_including_ties = MAP(WHICH_MAX(filtered_timestamps),
(x) -> time_filtered_events[x]);
event_list_including_ties
} |
MOST_RECENT |
/**
Find and return the most recent event of a given type using a uniform random
tie-breaker.
Examples:
// No recent records from the first record.
MOST_RECENT(timestamp, ‘request’) = null
// First record is the most recent from itself + 1 millisecond.
MOST_RECENT(timestamp+1, type) = ${@timeline.events[0]}
// First record has no events preceding it.
MOST_RECENT(timestamp, type) = null
// First record is not most recent to itself.
MOST_RECENT(timestamp, type) != timeline.events[0]
MOST_RECENT(timestamp - 1, type) != timeline.events[0]
// Erroneous inputs return null (no recent event).
MOST_RECENT(timestamp, ‘INVALID_TYPE’) = null
MOST_RECENT(timestamp, ‘’) = null
MOST_RECENT(timestamp, null) = null
MOST_RECENT(null, type) = null
MOST_RECENT(null, null) = null
MOST_RECENT(0, type) = null
// // Collect the third event.
// third_event = timeline.events[2];
// // Overwrite the first two events with the third event’s timestamp & type.
// ignore_output_var = MAP([0,1],
// (i) -> SET_PROPERTY(timeline.events[i],’type’,third_event.type);
// SET_PROPERTY(timeline.events[i],’timestamp’,third_event.timestamp);
// ‘ignored_output’);
//
// // Confirm that the most recent event from 1 millisecond after the third event
// // is the third event and that the third event is not most recent from itself.
// // TODO: Swap the first logical below with = third_event. Go backwards in the
// // list of events, rather than grabbing the first tie.
// (MOST_RECENT(third_event.timestamp + 1, third_event.type) = timeline.events[0] // first_event
// AND MOST_RECENT(third_event.timestamp, third_event.type) != third_event)
TRUE
@param current_time Timestamp in milliseconds.
@param filter_type String denoting the event type, e.g., ‘request’.
@return most_recent_event A single event using a uniform random tie-breaker.
*/
function most_recent(current_time, filter_type) {
most_recent_events = MOST_RECENT_TIES(current_time, filter_type);
idx = FLOOR(RANDOM() * SIZE(most_recent_events));
most_recent_event = most_recent_events[idx];
most_recent_event
} |
PREV |
/**
Return the next event with the same type as my_event.
Examples:
NEXT(self) != self // The next event is not the current event.
NEXT(timeline.events[0]) = NEXT_EVENT_(timeline.events[0].timestamp, timeline.events[0].type)
@param my_event An event from timeline.events or the current in-focus event, event / self.
@return next_event The next event of the same type as my_event.
*/
function prev(my_event) {
MOST_RECENT(my_event.timestamp, my_event.type)
}
// Future version.
// function prev(my_event) {
// idx = FILTER(RANGE_STEP(my_event.index, -1, -1),
// (i) -> timeline.events[i].type = my_event.type,
// (n,x) -> n > 1)[0]
// timeline.events[idx]
// } |
CUMSUM |
/**
Return a list of the partial cumulative sum of the elements of my_list.
Examples:
CUMSUM(RANGE(0,6)) = [0,1,3,6,10,15]
CUMSUM([null]) = [null] // TODO: [null] or null?
CUMSUM([null, null]) = [null, null] // TODO: [null] or null?
CUMSUM([‘’, null]) = [null,null]
CUMSUM([‘’, ‘’]) = [null,null]
CUMSUM([1, 2, ‘3’]) = [1,3,6.0]
CUMSUM([1, 2, null]) = [1, 3, null]
CUMSUM([1, null, 2]) = [1, null, null]
CUMSUM([1, 2, ‘’]) = [1, 3, null]
CUMSUM([‘a’]) = [null]
CUMSUM([‘1/0’]) = [null]
@param my_list A numeric list.
@return my_list_cumsum The partial/cumulative sum of my_list for elements 0 through index.
*/
function cumsum(my_list) {
my_sum = 0; // Initialize in the surrounding context.
// my_sum locally increments using the surrounding context of ‘my_sum’ to initialize.
MAP(my_list, (x) -> my_sum = my_sum + x; my_sum)
// my_sum = 0 since all of the changes to my_sum were local inside the lambda expression.
} |
EXPAND_COLS_FORMAT |
/**
Return a list of [prefix0:value[0], prefix1:value[1], …]. Such lists of
key:value pairs can be easily expanded into multiple columns when building
datasets.
prefix=’bs’ is a common use case when working with bootstrap
ensembles.
Examples:
EXPAND_COLS_FORMAT(‘bs’,[-1,0,1]) = [‘bs0:-1.0’,’bs1:0’,’bs2:1’]
EXPAND_COLS_FORMAT(‘z’,[‘A’,’B’,’C’]) = [‘z0:A’,’z1:B’,’z2:C’]
@param prefix An input string such as ‘bs’.
@param values A list of values to be prefixed and enumerated into key-value pairs.
@return expandable_list A list of key:value pairs compatible with expand_columns.
*/
function expand_cols_format(prefix, values) {
// List of [bs0:values[0], bs1:values[1], …] for prefix=’bs’.
MAP(RANGE(0,SIZE(values)), (i) -> CONCAT(prefix, i, ‘:’, values[i]))
} |
DATETIME_RANGE |
/**
Return a list of numbers from start to end in steps of step.
Examples:
start=’2020-01-01 00:00:00.000’;
end='2020-01-03 00:00:00.001';
step=86400000; // 1 day in milliseconds
DATETIME_RANGE(start, end, step)
= ['2020-01-01 00:00:00', '2020-01-02 00:00:00', '2020-01-03 00:00:00']
// Without RANGE_STEP().
start=MILLIS(TO_DATETIME(‘2020-01-01 00:00:00.000’));
end=MILLIS(TO_DATETIME('2020-01-03 00:00:00.001'));
step=86400000; // 1 day in milliseconds
num_steps=(end-start) / step;
MAP(RANGE(0,CEIL(num_steps)),
(x) -> FROM_UNIXTIME(x * step + start))
= ['2020-01-01 00:00:00', '2020-01-02 00:00:00', '2020-01-03 00:00:00']
@param start Datetime-parseable string, e.g., ‘2020-01-01 00:00:00.000’.
@param end Datetime-parseable string, e.g., ‘2020-02-02 00:00:00.001’.
@param step Numeric scalar in milliseconds.
@return step_list Datetime string list, e.g., [‘2020-05-01 00:00:00’, ‘2020-05-02 00:00:00’, …].
*/
function datetime_range(start, end, step) {
start = MILLIS(TO_DATETIME(start));
end = MILLIS(TO_DATETIME(end));
MAP(RANGE_STEP(start, end, step), (x) -> FROM_UNIXTIME(x))
} |
ANY |
/**
Return TRUE if any element of my_list is truthy.
Examples:
ANY([0,10,2,-4.3]) = TRUE
ANY([1,3,4,5,2,-4]) = TRUE
ANY([0]) = FALSE
ANY([‘tRuE’]) = TRUE
ANY([‘hi’]) = FALSE // All other strings are false.
ANY([null]) = FALSE
ANY([]) = FALSE
ANY(null) = null
@param my_list List, e.g. [0,10,2,-4.3].
@return any_true Boolean
*/
function any(my_list) {
any_true = False;
out = MAP(my_list,
(x) -> any_true = any_true OR x;
any_true);
// Always return False for empty lists, [].
IF(SIZE(my_list)=0, False, out[-1])
} |
WHICH_MIN |
/**
Return the index corresponding to the minimum value of my_list.
Examples:
WHICH_MIN([0,10,2,-4.3]) = [3]
WHICH_MIN([1,3,4,5,2,-4]) = [5]
WHICH_MIN([]) = []
WHICH_MIN(null) = null
WHICH_MIN([null,-2]) = [1]
WHICH_MIN([-2,-2.0,null,-3]) = [3]
WHICH_MIN([‘c’,-2]) = [1]
WHICH_MIN([‘c’,’d’]) = []
@param my_list Numeric list, e.g. [0,10,2,-4.3].
@return index Numeric scalar, e.g., 3.
*/
function which_min(my_list) {
min_value = AG_MIN(my_list);
index = WHICH(MAP(my_list, (x) -> IF(x=min_value, 1, 0)));
index
} |
NEXT_EVENT_TIES |
/**
Find and return a list of the next event(s) of a given type.
Examples:
// There should be a next event of type=’request’ after the first record.
NEXT_EVENT_TIES(timestamp, ‘request’) != []
// Current record is the next event from itself - 1 millisecond.
SIZE(NEXT_EVENT_TIES(timestamp-1, type)) >= 1
// First record is the next event from itself - 1 millisecond.
NEXT_EVENT_TIES(timestamp-1, type) = [timeline.events[0]]
@param current_time Timestamp in milliseconds.
@param filter_type String denoting the event type, e.g., ‘request’.
@return event_list_including_ties A list of events to accommodate potential ties.
*/
function next_event_ties(current_time, filter_type) {
filtered_events = FILTER(timeline.events,
(x) -> x.type=filter_type);
time_filtered_events = FILTER(filtered_events,
(x) -> x.timestamp > current_time);
filtered_timestamps = MAP(time_filtered_events,
(x) -> x.timestamp);
event_list_including_ties = MAP(WHICH_MIN(filtered_timestamps),
(x) -> time_filtered_events[x]);
event_list_including_ties
} |
NEXT_EVENT_ |
/**
Deprecated: use next_event(filter_type)
Find and return the next event of a given type using a uniform random tie-breaker.
Examples:
// There should be a next event of type=’request’ after the first record.
NEXT_EVENT_(timestamp, ‘request’) != []
// First record is the next event from itself - 1 millisecond.
NEXT_EVENT_(timestamp-1, type) = ${@timeline.events[0]}
@param current_time Timestamp in milliseconds.
@param filter_type String denoting the event type, e.g., ‘request’.
@return next_event A single event using a uniform random tie-breaker.
*/
function next_event_(current_time, filter_type) {
next_events = NEXT_EVENT_TIES(current_time, filter_type);
idx = FLOOR(RANDOM() * SIZE(next_events));
next_event = next_events[idx];
next_event
} |
INT_LIST |
/**
Takes a list of numbers and returns a list with the numbers
converted to integers.
Examples:
INT_LIST([0.2,-1.3,5]) = [0,-1,5] // Toward zero.
@param my_list A list of numbers.
@return my_int_list my_list converted to integers.
*/
function int_list(my_list) {
MAP(my_list, (x) -> TO_INT(x))
} |
LIST_REPLACE |
/**
Replace the elements of my_list with replace_list starting
at index. If replace_list is longer than the remaining
elements of my_list, allow overflow, returning a longer list.
Examples:
LIST_REPLACE(RANGE(-3,3), RANGE(99,102), 3) = [-3,-2,-1,99,100,101]
@param my_list A list.
@param replace_list Replacement list elements.
@param index Starting index for replacing my_list elements with replace_list.
@return my_replace_list my_list with replacements by replace_list starting at index.
*/
function list_replace(my_list, insert_list, index) {
list_size = SIZE(my_list);
insert_size = SIZE(insert_list);
indices = RANGE(0, index + insert_size);
MAP(indices,
(i) -> IF(i < index,
my_list[i],
IF(i < index + insert_size,
insert_list[i - index],
my_list[i])))
} |
IN |
/**
Return TRUE if x is equal to any element of my_list.
Examples:
IN(1, [‘a’,’b’,’c’]) = False
IN(‘b’, [‘a’,’b’,’c’,’c’]) = True
IN(1, [1,2,3]) = True
IN(1.0, [1,2,3]) = True
IN(1.2, [1.2,2,3]) = True
NOT IN(1, [1,2,3]) = False // NOT operator
NOT(IN(1, [1,2,3])) = False // NOT() UDF
IN(1.0, null) = null
IN(1.0, []) = False
IN(1.0, [null]) = False
IN(null, [null]) = True
IN(null, [1,2,3]) = False
IN(null, null) = null
@param my_list List, e.g. [0,10,2,-4.3].
@return x_in_list Boolean
*/
function IN(x,list) {
IF(list=null, null, ANY(MAP(list, (v) -> x=v)))
} |
INV_EXP |
/**
Return the quantile q of the exponential distribution of the probability p.
Examples:
INV_EXP(0.5) = 0.6931471805599453 // Median = natLog(2)
@param p Numeric scalar, e.g., 0.5.
@return q Numeric scalar, e.g., natLog(2)=0.69314718056.
*/
function inv_exp(p) {
// Inverse CDF (quantile function) of the exponential distribution.
-1 * LOG(1 - p, EXP(1))
} |
LIST_REMOVE_VALUE |
/**
Remove the element of my_list at index, pulling subsequent
elements up by one (-1) index.
Examples:
LIST_REMOVE_VALUE(RANGE(-3,3), 3) = [-3,-2,-1,1,2]
@param my_list A list.
@param index Index that value will replace.
@return my_remove_list my_list with the value of at index removed.
*/
function list_remove_value(my_list, index) {
list_size = SIZE(my_list);
indices=LIST_UNION(RANGE(0,index),
RANGE(index + 1,list_size));
LIST_SUB(my_list, indices)
} |
STDEV_BOOTSTRAP |
/**
Computes the standard deviation of the list of values, omitting the first
(0th) element, presuming it comes from the base model and, hence, is not a
bootstrap iteration.
Examples:
STDEV_BOOTSTRAP([-1,0,1,2]) = STDEV([0,1,2])
@param values A list of values.
@return stdev Standard deviation of the elements of values, except the first.
*/
function STDEV_BOOTSTRAP(values) {
// Remove the 0th index, the main model, which is not one of the bootstraps.
STDEV(LIST_REMOVE_VALUE(values,0))
} |
STRLEN |
/**
Return the number of characters in my_string.
Examples:
STRLEN(‘Hello World!’) = 12
@param my_string A string.
@return num_chars Integer number of characters in my_string.
*/
function strlen(my_string) {
// TODO: This should use a Java built-in function rather than counting regex matches.
SIZE(STRING_SPLIT(my_string,’’))
} |
SUM_NS |
/**
Return the sum of my_list, ignoring nulls.
Examples:
SUM_NS([null,null,2,2,2,4,4,4]) = 18
SUM_NS(null) = null
SUM_NS([null]) = 0
SUM_NS([‘’]) = null // TODO: 0. Strings should be ignored.
SUM_NS([‘c’]) = null // TODO: 0. Strings should be ignored.
SUM_NS([‘c’,1]) = null // TODO: 1. Strings should be ignored.
SUM_NS([]) = 0
@param my_list A list, e.g., [null,null,2,2,2,4,4,4].
@return sum The sum of non-null elements of my_list.
*/
function sum_ns(my_list) {
SUM(DROP_NULLS(my_list))
} |
ZERO_IMPUTE |
/**
Replace all null values of my_list with zero.
Examples:
ZERO_IMPUTE([null,null,2,4]) = [0,0,2,4]
@param my_list A list, e.g., [null,null,2,4].
@return my_imputed_list A list with nulls imputed using zero.
*/
function zero_impute(my_list) {
MAP(my_list, (x) -> COALESCE(x,0))
} |
PARSE_KV |
/**
Parse a key:value pair and return a map {‘cat’:’key’, ‘num’:value}. The UDF handles
certain cases where the key or value are empty. For example, if k:v is only v,
assume v is numerical.
Examples:
PARSE_KV(‘key:3.0’) = {‘cat’:’key’, ‘num’:3.0}
PARSE_KV(‘key:’) = {‘cat’:’c’, ‘num’:1.0}
PARSE_KV(‘key’) = {‘cat’:’c’, ‘num’:1.0}
PARSE_KV(‘3.0’) = {‘cat’:’c’, ‘num’:3.0}
PARSE_KV(3.0) = {‘cat’:’c’, ‘num’:3.0}
PARSE_KV(‘key:3.0:1’) = {‘cat’:’key’, ‘num’:3.0}
PARSE_KV(‘key:key2:1’) = {‘cat’:’key’, ‘num’:1.0} // Default numeric since ‘key2’ is nonnumeric.
@param kv Key-value pair string, e.g., key:3.0, where the value is numerical.
@return kv_map A map with {‘cat’:’key’, ‘num’:value}.
*/
function PARSE_KV(kv) {
// Parse a key:value pair and return a map {‘cat’:’key’, ‘num’:value}.
default_categorical=’c’;
default_numerical=1.0;
key_values = STRING_SPLIT(TO_STRING(kv), ‘:’); // Split k:v.
// Try to find a categorical key in k:v—default to ‘c’.
categorical = IF(SIZE(key_values)>1, key_values[0], default_categorical);
// Multiply by the value in the categorical:numerical k:v pair.
// If k:v is only v, assume it is numerical.
idx = IF(SIZE(key_values) > 2, 1, -1); // If ‘k:v1:v2’ or ‘k1:k2:v’, just use the second key.
numerical = COALESCE(TO_DOUBLE(key_values[idx]), default_numerical);
// Create map: {cat:key, num:value}.
kv_map = {‘cat’:categorical, ‘num’:numerical};
kv_map
} |
PARSE_KV_NUM |
/**
Return the numerical (value) component of a key-value pair string kv.
Examples:
PARSE_KV_NUM(‘key:3.0’) = 3.0
@param kv Key-value pair string, e.g., ‘key:3.0’, where the value is numerical.
@return num The value component of kv.
*/
function parse_kv_num(kv) {
TO_DOUBLE(MAP_VALUE(PARSE_KV(kv),’num’))
} |
SUM_MAP |
/**
Return the sum of values of a map of key=value pairs.
Examples:
my_map = JSON_TO_MAP(‘{“key1”:10,”key2”:2}’);
SUM_MAP(my_map) = 'sum:12.0'
my_list = [1,1,2,2,3,3,3,3];
my_map = COUNT_UNIQ(INT_LIST(my_list));
SUM_MAP(my_map) = 'sum:8.0'
@param my_map A map, e.g., {cat=1, num=2}.
@return kv_string A key-value pair as a string, e.g., num:2.
*/
function sum_map(my_map) {
// SUM(my_list) // Works for lists.
// SUM(my_map) // Doesn’t work for maps.
map_string = MAP_TO_LIST(my_map,
(k,v) -> CONCAT_WS(‘:’,k,v));
REDUCE(map_string,
(x,z) -> CONCAT_WS(‘:’, ‘sum’, PARSE_KV_NUM(x) + PARSE_KV_NUM(z)))
} |
LIST_MINUS |
/**
Elementwise list subtraction/’minus’: list1 - list2.
Examples:
LIST_MINUS([1,-1],[1,1]) = [0,-2]
@param list1 A list of numbers.
@param list2 A list of numbers.
@return list_out list1 - list2
*/
function list_minus(list1, list2) {
MAP(RANGE(0,SIZE(list1)), (i) -> list1[i] - list2[i])
} |
COUNT |
/**
function count(my_list) {
// Alias for SIZE().
SIZE(my_list)
} |
WINDOW_EVENTS |
/**
Return a list of all events of a given event_type occurring between
days_start and days_end, relative to current_time. Included
events have timestamps between current_time + (days_start, days_end).
Examples:
// For testing with the first event, no records fall in the window.
WINDOW_EVENTS(timestamp, ‘request’, -7, -3) = [] // Between 3 and 7 days ago.
// However, 5 days in the future from the first event, the current record falls in the window.
SIZE(WINDOW_EVENTS(timestamp + 5864001000, type, -7, -3)) >= 1 // Between 3 and 7 days ago.
// The current record falls inside a window starting looking back 1 day from 1 millisecond in the future.
WINDOW_EVENTS(timestamp + 1, type, -1, 0) = [${@timeline.events[0]}] // Within the past day, + 1 millisecond.
WINDOW_EVENTS(timestamp + 1, ‘BAD_TYPE’, -7, 0) = []
// Adjust window by -1, 0, and +1 milliseconds.
WINDOW_EVENTS(timestamp - 1, type, 0, 1)[0] = ${@timeline.events[0]} // First event is first returned.
WINDOW_EVENTS(timestamp, type, 0, 1)[0] = ${@timeline.events[0]} // First event is first returned.
WINDOW_EVENTS(timestamp + 1, type, 0, 1)[0] = null
// Adjust the current timestamp to be outside of the window for the current sample.
current = timestamp; SET_PROPERTY(event, ‘timestamp’, 0);
WINDOW_EVENTS(current + 1, type, -1, 0)[0] = null
@param current_time Timestamp in milliseconds.
@param event_type String denoting the event type, e.g., ‘request’.
@param days_start The number of days to add to current_time. Event timestamps are >=.
@param days_end The number of days to add to current_time. Event timestamps are < (strictly).
@return window_events A list of events.
*/
function window_events(current_time, filter_type, days_start, days_end) {
millis_per_day = 24 * 3600 * 1000;
// window_events
FILTER(timeline.events,
(x) -> days_diff = (x.timestamp - current_time) / millis_per_day;
IFRETURN(days_diff > days_end,0);
x.type = filter_type
AND BETWEEN(days_diff, days_start, days_end),
(n,x) -> x.timestamp > current_time + days_end * millis_per_day) // Break upon reaching the end of the window.
} |
RECENT_EVENTS |
/**
Return a list of all events of a given filter_type occurring in recent days.
Examples:
// No records in the past 7 days from the first record.
RECENT_EVENTS(timestamp, ‘request’, 7) = []
// Current record is within the past 7 days from the first record + 1 millisecond.
SIZE(RECENT_EVENTS(timestamp+1, type, 7)) >= 1
// First record is the only record within 7 days of itself + 1 millisecond.
RECENT_EVENTS(timestamp+1, type, 7) = [${@timeline.events[0]}]
@param current_time Timestamp in milliseconds.
@param filter_type String denoting the event type, e.g., ‘request’.
@param days The number of days to look back to include events.
@return recent_events A list of events.
*/
function recent_events(current_time, filter_type, days) {
// Return a list of all recent events of a given ‘filter_type’ in last ‘days’.
recent_events = WINDOW_EVENTS(current_time, filter_type, -1 * days, 0);
recent_events
} |
INV_RESCALE |
/**
Inverse the RESCALE() transformation on y using location
and scale parameters: (y - location) / scale.
Examples:
INV_RESCALE(11,1,2) = 5 // (11 - 1) / 2
@param y Scalar to be transformed.
@param location Scalar.
@param scale Scalar.
@return x (y - location) / scale.
*/
function inv_rescale(y, location, scale) {
(y - location) / scale
} |
PARSE_KV_CAT |
/**
Return the categorical (key) component of a key-value pair string kv.
Examples:
PARSE_KV_CAT(‘key:3.0’) = ‘key’
@param kv Key-value pair string, e.g., ‘key:3.0’.
@return cat The categorical (key) component of kv.
*/
function parse_kv_cat(kv) {
MAP_VALUE(PARSE_KV(kv),’cat’)
} |
LENGTH |
/**
function length(my_list) {
// Alias for SIZE().
SIZE(my_list)
} |
LIST_REVERSE_ |
/**
Deprecated in favor of the faster built-in function: LIST_REVERSE().
Reverse the order of elements in my_list from first-to-last to last-to-first.
Examples:
LIST_REVERSE([‘a’,’b’,’c’,’d’]) = [‘d’,’c’,’b’,’a’]
@param my_list A list.
@return my_list_reverse my_list with elements in reverse order.
*/
function list_reverse_(my_list) {
reverse_indices = RANGE_STEP(SIZE(my_list) - 1, -1, -1);
LIST_SUB(my_list, reverse_indices)
} |
SUBSET_KV |
/**
Given k, a key, extract values[i] corresponding to k==keys[i].
Examples:
SUBSET_KV(‘a’,[‘a’,’a’,’b’,’c’],[1,2,3,4]) = [1,2]
@param k One key.
@param keys A list of keys.
@param values A list of values.
@return k_values The subset of the values list with key k.
*/
function subset_kv(k, keys, values) {
// TODO: Is this redundant with SWITCH(x,cases,outputs)?
matches = MAP(keys, (x) -> x=k);
which_matches = WHICH(matches);
match_values = MAP(which_matches, (i) -> values[i]);
match_values
} |
UNIQ |
/**
Return a list of the unique values in my_list.
Examples:
UNIQ([‘c’,’a’,’b’,’c’,’b’,’c’,’c’]) = [‘a’,’b’,’c’]
UNIQ([1,1.0,2,2.000]) = [1,2]
UNIQ([‘L1’,1.0,2,2.000]) = [1,2,’L1’]
@param my_list A list.
@return uniq_keys A sorted list of the unique values in my_list.
*/
function uniq(my_list) {
counts = COUNT_UNIQ(STRING_LIST(my_list));
keys = MAP_KEYS(counts);
MAP(keys, (x) -> IF(IS_NUMERIC(x), TO_DOUBLE(x), x))
} |
GROUP_BY_SUM |
/**
Like COUNT_UNIQ(), but sum across values, rather than just counting the number of
times each unique element of groups appears.
Examples:
GROUP_BY_SUM([‘1’,’go’,’1’,’go’],[1,2,3,4]) = {‘1’:4.0, ‘go’:6.0}
GROUP_BY_SUM([‘1’,’go’,1,’go’],[1,2,3,4]) = {‘1’:4.0, ‘go’:6.0}
GROUP_BY_SUM([1,2,1.0,2.0],[1,2,3,4]) = {‘1’:4.0, ‘2’:6.0}
GROUP_BY_SUM([1,’2’,’1.0’,2.0],[1,2,3,4]) = {‘1’:4.0, ‘2’:6.0}
@param groups A list of group keys, e.g., [‘1’,’go’,’1’,’go’].
@param values A list of values to be summed within group, e.g., [1,2,3,4].
@return my_map A map of unique groups with values summed across values, e.g., {1=4.0, go=6.0}.
*/
function group_by_sum(groups, values) {
// Collect unique group names.DATE
// For each g in groups:
// group:SUM(values)
// values = [value[i] for x in groups if x==g]
groups = MAP(groups, (g) -> IF(IS_NUMERIC(g), TO_DOUBLE(g), g));
unique_groups = UNIQ(groups);
output = MAP(unique_groups,
(g) -> SUM(SUBSET_KV(TO_STRING(g), STRING_LIST(groups), values)));
LISTS_TO_MAP(unique_groups, output)
} |
SPLIT |
/**
Rename SPLIT_STRING() as just SPLIT().
@param my_string String.
@param pattern Regular-expression pattern on which to split my_string, e.g., ‘ +’.
@return y List of strings resulting from split.
*/
function split(string, pattern) {
STRING_SPLIT(string, pattern)
} |
REXP |
/**
function rexp(scale) {
INV_EXP(RANDOM()) * scale
} |
CLIP |
/**
Clip/censor x between lower and upper.
Examples:
CLIP(-1, 0, 100) = 0
CLIP(111, 0, 100) = 100
CLIP(50, 0, 100) = 50
@param x Scalar to be transformed.
@param lower Scalar.
@param upper Scalar.
@return y y = MIN(upper,MAX(lower,x)).
*/
function clip(x, lower, upper) {
MIN(upper, MAX(lower, x))
} |
MAP_UNION |
/**
Union two maps.
Note: Keys and values cannot have commas ‘,’, equals ‘=’, or braces ‘{’ and ‘}’ due to
JSON parsing.
Examples:
MAP_UNION({‘a’:1,’b’:2}, {‘c’:3}) = {‘a’:1, ‘b’:2, ‘c’:3}
MAP_UNION({‘a’:1,’b’:2}, {}) = {‘a’:1, ‘b’:2}
MAP_UNION({}, {‘c’:3}) = {‘c’:3}
MAP_UNION({}, {}) = {}
MAP_UNION({“key1a”:”value1a”,”key1b”:”value1b”},
{"key2a":"value2a","key2b":"value2b"})
= {"key1a":"value1a", "key1b":"value1b", "key2a":"value2a", "key2b":"value2b"}
@param my_map A map, e.g., {cat=1, num=2}.
@return my_appended_map my_map with the key:value pair appended.
*/
function map_union(my_map1, my_map2) {
// Transform the map to a string.
string_map1 = JSON(my_map1);
string_map2 = JSON(my_map2);
string_map1 = REGEX_REPLACE(string_map1, [‘^[{]’,’’, ‘[}]\(','']);
string_map2 = REGEX_REPLACE(string_map2, ['^[{]','', '[}]\)’,’’]);
string_map = CONCAT_WS(‘,’, string_map1, string_map2);
string_map = REGEX_REPLACE(string_map, [‘^,’,’’, ‘,$’,’’]);
string_map = CONCAT(‘{’, string_map, ‘}’);
JSON_TO_MAP(string_map)
} |
LIST_PREPEND |
/**
Prepend insert_list to my_list (at the beginning), pushing subsequent elements
down by the number of elements of insert_list.
Examples:
LIST_PREPEND(RANGE(-3,3), RANGE(99,102)) = [99,100,101,-3,-2,-1,0,1,2]
@param my_list A list.
@param insert_list List to be prepended to my_list.
@return my_longer_list my_list with insert_list prepended.
*/
function list_prepend(my_list, insert_list) {
LIST_INSERT(my_list, insert_list, 0)
} |
MD5_PARTITIONS |
/**
Use MD5() to map partition_var into a partition denoted by names. The mapping is
quasi-random with respect to partition_var due to MD5()’s hashing properties. Use
shares to assign the probability of assignment to each partition. Doubles as a
weighted uniform hash-deterministic random sampler (e.g., random uniform given a seed).
Examples:
SET_PROPERTY(timeline,’id’,’my_timeline_id’); // For a deterministic example.
// New succinct, extendable, statistically robust UDF.
MD5_PARTITIONS(timeline.id, 'my_seed', [0.008, 0.002], ['train','test']) = 'test'
// Old succinct, undocumented approach with questionable statistical properties.
// IF(HASH(timeline.id) % 100 > 20, "train", "test")
@param partition_var A partitioning expression, e.g., timeline.id.
@param seed String to append to partition_var prior to hashing in order to customize
(synchronize or make independent) the randomization for the application/model/etc.
@param shares List of (relative) shares of the split, e.g., [0.8, 0.2].
@param names List of partition names, e.g., [‘train’, ‘test’].
@return name A single, MD5-hash-weighted-random element of names.
*/
function md5_partitions(partition_var, seed, shares, names) {
sum_shares = SUM(shares);
upper_bin_edges = MAP(CUMSUM(shares), (x) -> x / sum_shares); // Normalize relative shares to sum to one.
lower_bin_edges = LIST_PREPEND_VALUE(upper_bin_edges,0);
md5_string = CONCAT(TO_STRING(partition_var), TO_STRING(seed)); // var + seed.
probability = MD5_MOD(md5_string, 1000) / 1000; // Create 1000 bins (e.g., 0.1% bins).
// Two approaches to getting the record’s partition bin index.
// prob_bin = BIN_INDEX(probability, lower_bin_edges);
prob_which_bin = WHICH_BIN_INDEX(probability, lower_bin_edges);
name = names[prob_which_bin];
name
} |
GSUB |
/**
Simple substitution of find with replace. Replaces all matches using
REGEX_REPLACE().
Examples:
GSUB(’ ‘,’_’,’Hello World!’) = ‘Hello_World!’
@param find String. Can be a regular expression.
@param replace String.
@param my_string String.
@return y String with replacements made.
*/
function gsub(find, replace, my_string) {
// Simple substitution
REGEX_REPLACE(my_string, [find, replace])
} |
NEXT_EVENT_SORTED |
/**
Find and return the next event of a given type, assuming that
timeline.events is sorted by each event’s timestamp.
Examples:
// There should be a next event of type=’request’ after the first record.
NEXT_EVENT_SORTED(timestamp, ‘request’) != []
// First record is the next event from itself - 1 millisecond.
NEXT_EVENT_SORTED(timestamp-1, type) = ${@timeline.events[0]}
@param current_time Timestamp in milliseconds.
@param filter_type String denoting the event type, e.g., ‘request’.
@return next_event A single event.
*/
function next_event_sorted(current_time, filter_type) {
filtered_events = FILTER(timeline.events,
(x) -> x.type=filter_type);
time_filtered_events = FILTER(filtered_events,
(x) -> x.timestamp > current_time);
// Return the first event in the list, since events are sorted by timestamp. No tie breaker.
next_event = time_filtered_events[0];
next_event
} |
DOUBLE |
/**
Rename to_double() as just double().
@param x Scalar or string.
@return y Numeric scalar.
*/
function double(x) {
TO_DOUBLE(x)
} |
RUNIF |
/**
Draw a random uniform with location and scale parameters:
(0,1)*scale + location.
@param location Scalar.
@param scale Scalar.
@return y Random uniform draw from (0,1)*scale + location.
*/
function runif(location, scale) {
RANDOM() * scale + location
} |
OCTAL_INT |
/**
Rename to_int/int32() as octal_int() because leading zeros in strings are treated as
octal: TO_INT(‘08’) is invalid, TO_INT(‘010’) = 8 base 10.
Examples:
OCTAL_INT(‘08’) = null // Invalid octal.
OCTAL_INT(‘010’) = 8 // Octal ‘010’ is 8 base 10.
OCTAL_INT(‘027’) = 23 // Octal ‘027’ is 2POW(8,1)+7POW(8,0) = 23 base 10.
OCTAL_INT(‘0127’) = 1POW(8,2)+2POW(8,1)+7*POW(8,0) // Octal ‘0127’ is 87 base 10.
OCTAL_INT(‘0127’) = TO_INT(‘0127’) // Alias
@param x Scalar or string.
@return y Integer.
*/
function octal_int(x) {
TO_INT(x)
} |
AVG_MAP |
/**
Return the average of the values of a map of key=value
pairs.
Examples:
my_map = JSON_TO_MAP(‘{“key1”:10,”key2”:2}’);
AVG_MAP(my_map) = 'avg:6.0'
my_list = [1,1,2,2,3,3,3,3];
my_map = COUNT_UNIQ(INT_LIST(my_list));
AVG_MAP(my_map) = 'avg:2.6666666666666665'
@param my_map A map, e.g., {cat=1, num=2}.
@return kv_string A key-value pair as a string, e.g., num:2.
*/
function avg_map(my_map) {
// AVG(my_list) // Works for lists.
// AVG(my_map) // Doesn’t work for maps.
my_map_sum = SUM_MAP(my_map);
CONCAT_WS(‘:’, ‘avg’, PARSE_KV_NUM(my_map_sum) / SIZE(my_map))
} |
COUNT_NS |
/**
Return the number of elements in my_list, ignoring nulls.
Examples:
COUNT_NS([null,null,2,2,2,4,4,4]) = 6
COUNT_NS([null]) = 0
COUNT_NS([‘’]) = 1
COUNT_NS([]) = 0
COUNT_NS([‘null’]) = 1
@param my_list A list, e.g., [null,null,2,2,2,4,4,4].
@return count The number of non-null elements of my_list.
*/
function count_ns(my_list) {
SIZE(DROP_NULLS(my_list))
} |
ROUND10 |
/**
Round x to the base_10_decimal_place.
Examples:
ROUND10(100*PI(),-1) = 310.0
ROUND10(100*PI(),3) = 314.159
@param x Scalar.
@param base_10_decimal_place Decimal place to round, e.g., -2 (100s), -1 (10s), 0 (1s), 1 (0.1s), 2 (0.01s).
@return x_rounded x rounded to base_10_decimal_place.
*/
function round10(x, base_10_decimal_place) {
round_units = POW(10, base_10_decimal_place);
ROUND(x * round_units) / round_units
} |
LOG_BIN |
/**
Scale x into powers of base. To adjust location, do pre-/post-adjustments: e.g.,
LOG_BIN() + location or LOG_BIN(x + location, base).
Examples:
LOG_BIN(1024 * 0.8, 2) = 512 // Powers of 2.
LOG_BIN(1024 * 0.8, 4) = 256 // Powers of 4.
LOG_BIN(1024 * 0.8, 10) = 100 // Powers of 10.
@param x Scalar.
@param base Scalar.
@return x_bin The log-bin that x falls into in units of x.
*/
function log_bin(x, base) {
POW(base, FLOOR(LOG(x, base)))
} |
QUANTILE |
/**
Retrieve the quantile value for which p proportion of my_list
is smaller / ranked / sorted lower.
Examples:
QUANTILE(RANGE(90,101), 0.5) = 95 // Median between 90 and 100.
median = QUANTILE(RANDOM_LIST(500), 0.5); BETWEEN(median,0.4,0.6) = 1
@param my_list A list.
@param p Probability in [0,1] at which to retrieve the quantile. 0=MIN,
1=MAX.
@return q_value The quantile value corresponding to p.
*/
function quantile(my_list, p) {
num_elements = SIZE(my_list) - 1;
q = SORT(my_list)[FLOOR(p * num_elements)];
q
} |
LIST_SUBTRACT |
/**
Subtract list1 and list2 by removing all elements of
list1 having values in list2.
Examples:
LIST_SUBTRACT(RANGE(-3,3), RANGE(-1,2)) = [-3,-2,2]
@param list1 A list.
@param list2 List against which each element of list1 will be checked.
@return my_intersect_list list1 with list2’s common values removed.
*/
function list_subtract(list1, list2) {
// Quadratic list search without sorting.
// Take each element of list1 and count how many times that
// same element shows up in list2.
FILTER(list1,
(x) -> SIZE(FILTER(list2,
(y) -> x = y)) = 0)
} |
LIST_REMOVE |
/**
Remove the elements of my_list at each of indices,
pulling subsequent elements up by one (-1) index for each
element above them removed.
Examples:
LIST_REMOVE(RANGE(-3,3), [1,3]) = [-3,-1,1,2]
@param my_list A list.
@param indices List of indices to remove.
@return my_remove_list my_list with the elements at indices removed.
*/
function list_remove(my_list, indices) {
idx = RANGE(0,SIZE(my_list));
// Could use the REDUCE() for-loop if indices is sorted.
include_indices = LIST_SUBTRACT(idx,indices);
LIST_SUB(my_list, include_indices)
} |
LIST_MULTIPLY |
/**
Elementwise list multiplication: list1 * list2.
Examples:
LIST_MULTIPLY([1,-1],[1,2]) = [1,-2]
@param list1 A list of numbers.
@param list2 A list of numbers.
@return list_out list1 * list2
*/
function list_multiply(list1, list2) {
MAP(RANGE(0,SIZE(list1)), (i) -> list1[i] * list2[i])
} |
LIST_PLUS |
/**
Elementwise list addition/’plus’: list1 + list2.
Examples:
LIST_PLUS([1,-1],[1,1]) = [2,0]
@param list1 A list of numbers.
@param list2 A list of numbers.
@return list_out list1 + list2
*/
function list_plus(list1, list2) {
MAP(RANGE(0,SIZE(list1)), (i) -> list1[i] + list2[i])
} |
BAYESIAN_BOOTSTRAP |
/**
Use MD5() to map the input string cluster_var into a uniform random number.
Transform that uniform random number into a random exponential to create a random
weight for the Bayesian bootstrap.
Using an identifier such as timeline.id approximates the statistical behavior
of using clustered standard errors or a block-bootstrap.
Examples:
SET_PROPERTY(timeline,’id’,’my_timeline_id’); // For a deterministic example.
// bootstrap_cluster_var is defined by "cluster_var_expression": "timeline.id"
cluster_var = timeline.id; bootstrap_id = 3;
-
BAYESIAN_BOOTSTRAP(cluster_var, 'bootstraps', bootstrap_id) = 0.4263795482392406
-
// -1 * LOG(1 - MD5_MOD(CONCAT('string', bootstrap_id, cluster_var), 1000000) / 1000000 + 0.0000005, EXP(1))
@param cluster_var An input string, e.g., timeline.id.
@param seed String to prepend to cluster_var prior to hashing in order to customize
(synchronize or make independent) the randomization for the application/model/etc.
@param bootstrap_id Bootstrap run identifier. Typically 1 through the number of bootstraps.
@return weight_multiplier A single, MD5-hash-produced, and exponentially-distributed random number.
*/
function bayesian_bootstrap(cluster_var, seed, bootstrap_id) {
// Hash string.
md5_string = CONCAT(seed, bootstrap_id, cluster_var);
// 6 digits of precision for the uniform hash.
uniform_random_hash = MD5_MOD(md5_string, 1000000) / 1000000;
// Input range to the exponential inverse CDF: 0.0000005 to 0.9999995, by 1e-6.
weight_multiplier = INV_EXP(uniform_random_hash + 0.0000005);
weight_multiplier
} |
WEAKLY_BETWEEN |
/**
If start <= x <= end, return 1. Otherwise return 0.
Examples:
start = 700; end = -700; // Days
reference_time = MILLIS(TO_DATETIME('2022-01-01 9:00:00.000'));
// Replace this expression...
IF(reference_time - end*24*3600*1000 <= timestamp
AND timestamp <= reference_time - start*24*3600*1000, 1, 0)
-
// With this more readable expression using the UDF.
BETWEEN(reference_time - timestamp,
-
-
@param x Scalar.
@param start Scalar.
@param end Scalar.
@return is_x_weakly_between IF(start <= x <= end, 1, 0).
*/
function weakly_between(x, start, end) {
IF(start <= x AND x <= end, 1, 0)
} |
ROUND_BASE |
/**
Round x to the decimal_place in base; return the result in base 10.
Examples:
ROUND_BASE(57,2,-1) = 58.0 // Round to the nearest 2.
ROUND_BASE(57,5,-1) = 55.0 // Round to the nearest 5.
ROUND_BASE(57,5,-2) = 50.0 // Round to the nearest 25.
ROUND_BASE(57.25,5,1) = 57.2 // Round to the nearest 1/5.
ROUND_BASE(100*PI(),10,-1) = 310.0
ROUND_BASE(100*PI(),10,3) = 314.159
@param x Scalar.
@param base Numeric base for rounding.
@param decimal_place Decimal place to round, e.g., -2 (100s), -1 (10s), 0 (1s), 1 (0.1s), 2 (0.01s),
where decimal place means base^(-1*decimal_place).
@return x_rounded x rounded to decimal_place in base, returned in base 10.
*/
function round_base(x, base, decimal_place) {
round_units = POW(base, decimal_place);
ROUND(x * round_units) / round_units
} |
KTH_INDEX |
/**
Return the k^th index of my_list. Designed to handle nulls and empty lists.
k^th index, k in [0, SIZE() - 1]
Same as just using k, but protects against empty lists/nulls/out-of-range indices.
Returns null because LIST_VALUE([], null) returns null, but LIST_VALUE([],0) does not
(bugfix in progress).
Examples:
KTH_INDEX([-2,0,3,10,20],0) = 0
KTH_INDEX([-2,0,3,10,20],3) = 3
KTH_INDEX([],0) = null
KTH_INDEX(null,0) = null
@param my_list A list.
@param k The index to confirm.
@return kth_index The index of the k^th element, if any.
*/
function kth_index(my_list, k) {
list_size = SIZE(my_list);
IF(0 <= k AND k < list_size, k, null) // Start at 0 for the first element.
} |
FIRST_INDEX |
/**
Return the first index of my_list. Designed to handle nulls and empty lists.
Examples:
FIRST_INDEX([-2,0,3,10,20]) = 0
FIRST_INDEX([]) = null
FIRST_INDEX(null) = null
@param my_list A list.
@return first_index The index of the first element, if any.
*/
function first_index(my_list) {
// Maybe there could be a special notation for last? (first=’min’, last=’max’?)
// E.g., \({@my_list[0]} = my_list[0] // If SIZE(my_list) > 0.
// \){@my_list[‘min’]} = my_list[FIRST_INDEX(my_list)].
// ${@my_list[‘max’]} = my_list[LAST_INDEX(my_list)].
KTH_INDEX(my_list, 0)
} |
ALL |
/**
Return TRUE if all elements of my_list are truthy.
Examples:
ALL([0,10,2,-4.3]) = FALSE
ALL([1,3,4,5,2,-4]) = TRUE
ALL([0]) = FALSE
ALL([‘tRuE’]) = TRUE
ALL([‘hi’]) = FALSE // All other strings are false.
ALL([null]) = FALSE
ALL([]) = TRUE // Vacuously true.
ALL(null) = null
@param my_list List, e.g. [0,10,2,-4.3].
@return all_true Boolean
*/
function all(my_list) {
all_true = True;
out = MAP(my_list,
(x) -> all_true = all_true AND x;
all_true);
// Always return (vacuously) True for empty lists, [].
IF(SIZE(my_list)=0, True, out[-1])
} |
LN |
/**
Return the natural log, log base e of input x. Alias for NATLOG().
Examples:
LN(1) = 0
LN(EXP(1)) = 1
@param x Numeric scalar in (0, infinity).
@return ln_x Numeric scalar.
*/
function ln(x) {
natLog(x)
} |
EVENTS |
/**
Return a list of all events of a given event_type with no time filtering.
See other UDFs like WINDOW_EVENTS() for time-based filtering.
Examples:
EVENTS(‘request’) != [] // There is at least one event of type ‘request’.
EVENTS(‘INVALID_TYPE’) = [] // There are none of type ‘INVALID_TYPE’.
@param event_type String denoting the event type, e.g., ‘request’.
@return events_of_type A list of events of type = event_type.
*/
function events(event_type) {
FILTER(timeline.events, (e) -> e.type=event_type)
} |
STR_WHICH |
/**
Return a list of indices of my_list elements that are not equal to the empty
string ‘’.
Examples:
STR_WHICH([‘’,’hi’,’’,’bye’]) = [1,3]
STR_WHICH([0,’hi’,FALSE,’bye’,null,-1,’’,3]) = [1,3,5,7]
@param my_list List of strings, e.g. [‘’,’hi’,’’,’bye’].
@return indices List of indices of non-empty-string elements, e.g., [1,3].
*/
function str_which(string_list) {
WHICH(MAP(string_list,
(x) -> IF(x!=’’ AND x!=FALSE AND x!=0 AND x!=null, TRUE, FALSE)))
} |
ROUND_DIGIT |
/**
Round x to the base_10_decimal_place.
Examples:
ROUND_DIGIT(100*PI(),-1) = 310.0
ROUND_DIGIT(100*PI(),3) = 314.159
@param x Scalar.
@param base_10_decimal_place Decimal place to round, e.g., -2 (100s), -1 (10s), 0 (1s), 1 (0.1s), 2 (0.01s).
@return x_rounded x rounded to base_10_decimal_place.
*/
function round_digit(x, base_10_decimal_place) {
round_units = POW(10, base_10_decimal_place);
ROUND(x * round_units) / round_units
} |
REPLACE |
/**
Rename REGEX_REPLACE() as just REPLACE().
@param my_string String.
@param replacements List of replacement string pairs, alternating between regular expressions and their
substitutions, e.g., [’ +’,’space’,’[0-9]’,’digit’], applied first to last.
@return y String with subtitutions made.
*/
function replace(my_string, replacements) {
REGEX_REPLACE(my_string, replacements)
} |
LIST_RANGE |
/**
Subset my_list to a range of indices from start_index to end_index,
excluding end_index.
Uses RANGE() function, so the indices are Pythonic: 0, …, SIZE() - 1.
Recycles based on the size of the list using the modulus % operator. Consider using
my_list[RANGE(start,end)] for subsetting without recycling.
Examples:
LIST_RANGE([‘a’,’b’,’c’,’d’],2,4) = [‘c’,’d’]
LIST_RANGE(RANGE(-2,2),1,3) = [-1,0]
LIST_RANGE(RANGE(0,10),3,7) = [3,4,5,6]
LIST_RANGE(RANGE(0,10),-7,-3) = [3,4,5,6]
LIST_RANGE(RANGE(0,10),7,3) = []
LIST_RANGE(RANGE(0,10),3,17) = [3,4,5,6,7,8,9,0,1,2,3,4,5,6]
// LIST_SUB() allows negative indices but does not recycle.
LIST_SUB([‘a’,’b’],RANGE(-4,4)) = [null, null, ‘a’, ‘b’, ‘a’, ‘b’, null, null]
// LIST_RANGE() allows negative indices and recycles.
LIST_RANGE([‘a’,’b’],-4,4) = [‘a’, ‘b’, ‘a’, ‘b’, ‘a’, ‘b’, ‘a’, ‘b’]
LIST_RANGE([‘a’,’b’,’c’,’d’],2,’4’) = [‘c’,’d’]
LIST_RANGE([‘a’,’b’,’c’,’d’],’c’,4) = null
LIST_RANGE([‘a’,’b’,’c’,’d’],null,null) = []
LIST_RANGE([‘a’,’b’,’c’,’d’],null,4) = []
LIST_RANGE([‘a’,’b’,’c’,’d’],2,null) = []
LIST_RANGE([],2,4) = [null, null] // TODO: null or []. Is this desirable?
LIST_RANGE(null,2,4) = null // TODO: null or []. Is this desirable?
LIST_RANGE([null],2,4) = [null, null] // Recycling.
LIST_RANGE([null],0,1) = [null]
@param my_list A list.
@param start_index Integer.
@param end_index Integer. Should be > start_index.
@return list_subset List with only elements in indices.
*/
function list_range(my_list, start, end) {
// Use RANGE() then mod % to cleanly apply recycling.
indices = COALESCE(RANGE(start, end),[]); // Invalid inputs mean empty indices.
list_size = SIZE(my_list);
recycled_indices = MAP(indices, (x) -> x % list_size);
// If the original inputs should return an empty list, create an empty list with RANGE().
// Otherwise, return the subsetted list.
IF(start >= end AND start!=null and end!=null,
RANGE(start, end),
LIST_SUB(my_list, recycled_indices))
}
// function list_range_old(my_list, start_index, end_index) {
// list_size = SIZE(my_list);
// start = start_index % list_size;
// end = (end_index - 1) % list_size; // - 1 because the end index is exclusive.
// indices = RANGE(start, end + 1); // + 1 because the end index is exclusive.
//
// // If the original inputs should return an empty list, create an empty list with RANGE().
// // Otherwise, return the subsetted list.
// IF(start_index > end_index,
// RANGE(start_index, end_index),
// LIST_SUB(my_list, indices))
// } |
LOG_ROUND_BASE |
/**
Round the logarithm of x in base to the decimal_place; return the rounded,
base-exponentiated result in base 10:
base^ROUND_BASE(log(x,base),base,decimal_place).
This is especially useful for creating steps that are logarithmic and exponential while
preserving the original units of highly skewed data.
Examples:
// Convert 9 to base 2, round to -1 (‘twos’) digit, convert back to base 10.
// 9, round to the nearest ‘twos’ power of 2 –> 9 = 2^3.x –> 4 –> 2^4 –> 16.
LOG_ROUND_BASE(9,2,-1) = 16
MAP([5, 9, 27, 62], (x) -> TO_INT(LOG_ROUND_BASE(x,2,-1))) = [4, 16, 16, 64]
LOG_ROUND_BASE(120,10,0) = POW(10,2) // (rounding the exponent 2.1 –> 2)
LOG_ROUND_BASE(1000000,10,-1) = POW(10,10) // (rounding the exponent 6 –> 10)
MAP(RANGE(1,9), (x) -> ROUND10(LOG_ROUND_BASE(x,2,1),3))
= [1.0, 2.0, 2.828, 4.0, 5.657, 5.657, 8.0, 8.0]
@param x Scalar.
@param base Numeric base for rounding.
@param decimal_place Decimal place to round, e.g., -2 (100s), -1 (10s), 0 (1s), 1 (0.1s), 2 (0.01s),
where decimal place means base^(-1*decimal_place).
@return x_rounded x rounded to decimal_place in base, returned in base 10.
*/
function log_round_base(x, base, decimal_place) {
POW(base, ROUND_BASE(LOG(x,base), base, decimal_place))
} |
HISTOGRAM |
/**
Compute a histogram bin for my_list: count the number of elements
of my_list between two elements of knots. index and
cumulative_counts determine which two knots to use. return_counts
determines whether to return counts or a rate = counts / width between
the two knots.
Examples:
my_list = [1,2,3,4,7,10,12]; // Numerical list.
index = 0; // Do loops from 0 to SIZE(knots) - 2 (Which bin to count?)
-
cumulative_counts = FALSE; // Count always from knot[0] or knot[index] to knot[index+1]?
return_counts = FALSE; // Counts or time rates (divide by duration)?
HISTOGRAM(my_list, index, knots, cumulative_counts, return_counts) = 4/7
my_list = [1,2,3,4,7,10,12]; // Numerical list.
index = 1; // Do loops from 0 to SIZE(knots) - 2 (Which bin to count?)
-
cumulative_counts = TRUE; // Count always from knot[0] or knot[index] to knot[index+1]?
return_counts = FALSE; // Counts or time rates (divide by duration)?
HISTOGRAM(my_list, index, knots, cumulative_counts, return_counts) = 7/14
@param my_list A numeric list.
@param index Knot index/bin to be evaluated, e.g., from 0 to SIZE(knots) - 2.
@param knots List of knots to be considered, e.g., [0, 7].
@param cumulative_counts Count from knot[0] (TRUE) or knot[index] (FALSE) to knot[index+1]?
@param return_counts Counts (TRUE) or time rates / divide by duration (FALSE)?
@return y Scalar count or rate, depending on counts.
*/
function histogram(my_list, index, knots, cumulative_counts, return_counts) {
// my_list = [1,2,3,4,10,12]; // Numerical list.
// index = 0; // Do loops from 0 to SIZE(knots) - 2 (Which bin to count?)
// knots = [0, 7]; // In days.
// cumulative_counts = FALSE; // Count always from knot[0] or knot[index] to knot[index+1]?
// return_counts = FALSE; // Counts or time rates (divide by duration)?
window_start = IF(cumulative_counts,
knots[0],
knots[index]);
window_end = knots[index + 1];
// ‘Duration’ is just the difference between knots.
window_duration = window_end - window_start;
count_between_knots = HISTOGRAM_BIN(my_list, window_start, window_end);
elements_per_duration = count_between_knots / window_duration;
// Return the counts or rate of events per day.
IF(return_counts, count_between_knots, elements_per_duration)
} |
LIST_APPEND_VALUE |
/**
Append value to my_list (at the end).
Examples:
LIST_APPEND_VALUE(RANGE(-3,3), 100) = [-3,-2,-1,0,1,2,100]
@param my_list A list.
@param value Value to be appended to my_list.
@return my_longer_list my_list with value appended.
*/
function list_append_value(my_list, value) {
LIST_INSERT_VALUE(my_list, value, SIZE(my_list))
} |
LIST_INTERSECT |
/**
Intersect list1 and list2 by retaining all elements of
list1 having values in list2.
Examples:
LIST_INTERSECT(RANGE(-3,3), RANGE(-1,2)) = [-1,0,1]
@param list1 A list.
@param list2 List against which each element of list1 will be checked.
@return my_intersect_list list1 with list2’s common values retained.
*/
function list_intersect(list1, list2) {
// Quadratic list search without sorting.
// Take each element of list1 and count how many times that
// same element shows up in list2.
FILTER(list1,
(x) -> SIZE(FILTER(list2,
(y) -> x = y)) > 0)
} |
LIST_DIVIDE |
/**
Elementwise list division: list1 / list2.
Examples:
LIST_DIVIDE([1,-1],[1,2]) = [1,-1/2]
@param list1 A list of numbers.
@param list2 A list of numbers.
@return list_out list1 / list2
*/
function list_divide(list1, list2) {
MAP(RANGE(0,SIZE(list1)), (i) -> list1[i] / list2[i])
} |
SWITCH |
/**
Obtain the index for which x = cases[index].
Return the value of outputs[index].
Examples:
SWITCH(3, [1,2,3], [‘a’,’b’,’c’]) = ‘c’
@param x A value to compare with the list of cases.
@param cases A list of values to compare to x.
@param outputs A list of outputs.
@return x_output The output value for x’s case.
*/
function switch(x, cases, outputs) {
// Cases list should match the type of x.
// Outputs list should have a common output type.
// Defaults to the last value of outputs.
// ENUM_BUCKET() in a UDF causes this the fail (bugfix,
// similar to MAP_TO_LIST() in progress). This logic is
// valid and functional, though.
// outputs[ENUM_BUCKET(x, cases, SIZE(outputs) - 1)]
// Probably slower.
idx = RANGE(0,SIZE(cases));
// Using WHICH().
// matched_index = COALESCE(WHICH(MAP(cases,
// (c) -> x=c))[0],
// SIZE(outputs) - 1);
matched_index = COALESCE(FILTER(idx, (i) -> x=cases[i])[0],
SIZE(outputs) - 1);
outputs[matched_index]
} |
RANGE_N |
/**
Return a list of n evenly numbers from start to end (inclusive).
Examples:
RANGE_N(-2,8,6) = [-2,0,2,4,6,8]
@param start Numeric scalar, e.g., -2.
@param end Numeric scalar, e.g., 8.
@param n Number of evenly spaced numbers to return.
@return range_list Numeric list, e.g., [-2,0,2,4,6,8].
*/
function range_n(start, end, n) {
idx = RANGE(0,n);
step = (end - start) / (n - 1); // n-1 spaces between [start, end].
range_list = MAP(idx,
(x) -> x * step + start);
range_list
} |
AVG_NS |
/**
Return the mean of my_list, ignoring nulls.
Examples:
AVG_NS([null,null,2,2,2,4,4,4]) = 3
AVG_NS(null) = null
TRUE // TODO: null. AVG_NS([null]) = NaN // TODO: null. Divide by zero?
AVG_NS([‘’]) = null
AVG_NS([‘c’]) = null
AVG_NS([‘c’,1]) = null // TODO: 1. Ignore strings.
TRUE // TODO: NaNs. AVG_NS([]) = NaN // TODO: null. Divide by zero?
@param my_list A list, e.g., [null,null,2,2,2,4,4,4].
@return mean The mean of non-null elements of my_list.
*/
function avg_ns(my_list) {
AVG(DROP_NULLS(my_list))
} |
AG_MAX_NS |
/**
Return the maximum value of my_list, ignoring nulls.
Examples:
AG_MAX_NS([null,null,2,2,2,4,4,4]) = 4
AG_MAX_NS(null) = null
AG_MAX_NS([null]) = null
AG_MAX_NS([‘’]) = null
AG_MAX_NS([‘c’]) = null
AG_MAX_NS([‘c’,1]) = 1
AG_MAX_NS([]) = null
@param my_list A list, e.g., [null,null,2,2,2,4,4,4].
@return max The maximum value of non-null elements of my_list.
*/
function ag_max_ns(my_list) {
AG_MAX(DROP_NULLS(my_list))
} |
MODE_IMPUTE |
/**
Replace all null values of my_list with the mode of the
non-null elements.
Examples:
MODE_IMPUTE([null,null,2,2,4]) = [2,2,2,2,4]
@param my_list A list, e.g., [null,null,2,2,4].
@return my_imputed_list A list with nulls imputed using the mode.
*/
function mode_impute(my_list) {
mode = MODE_NS(my_list)[0]; // Extract the first mode, in case of ties.
MAP(my_list, (x) -> COALESCE(x,mode))
} |
MAP_APPEND |
/**
Append a key:value pair to my_map.
Note: Keys and values cannot have commas ‘,’, equals ‘=’, or braces ‘{’ and ‘}’ due to
JSON parsing.
Examples:
MAP_APPEND({‘v’:1},’v0’,1) = {‘v0’:1,’v’:1}
MAP_APPEND(null,’v’,1) = {‘v’:1}
MAP_APPEND({},’v’,1) = {‘v’:1}
my_map = JSON_TO_MAP(‘{“key1”:”value1”,”key2”:”value2”}’);
MAP_APPEND(my_map, 'key3', 123.0)
= {'key1':'value1', 'key2':'value2', 'key3':123.0}
@param my_map A map, e.g., {cat=1, num=2}.
@return my_appended_map my_map with the key:value pair appended.
*/
function map_append(my_map, key, value) {
MAP_UNION(my_map, {key:value})
} |
KTH_LAST_INDEX |
/**
Return the k^th-from-last index of my_list. Designed to handle nulls and empty
lists.
k^th index, k in [0, SIZE() - 1]
Same as just using k, but protects against empty lists/nulls/out-of-range indices.
Returns null because LIST_VALUE([], null) returns null, but LIST_VALUE([],0) does not
(bugfix in progress).
Examples:
KTH_LAST_INDEX([-2,0,3,10,20],3) = 1
KTH_LAST_INDEX([],0) = null
KTH_LAST_INDEX(null,0) = null
@param my_list A list.
@param k The index to confirm.
@return kth_last_index The index of the k^th element, if any.
*/
function kth_last_index(my_list, k) {
// k^th from last index, k in [0, SIZE() - 1]
list_size = SIZE(my_list);
index = list_size - 1 - k; // Start at 0 for the last element.
// Returns null because LIST_VALUE([], null) returns null, but LIST_VALUE([],0) does not (bugfix in progress).
IF(0 <= index AND index < list_size, index, null)
} |
LAST_INDEX |
/**
Return the last index of my_list. Designed to handle nulls and empty lists.
Examples:
LAST_INDEX([-2,0,3,10,20]) = 4
LAST_INDEX([]) = null
LAST_INDEX(null) = null
@param my_list A list.
@return last_index The index of the last element, if any.
*/
function last_index(my_list) {
// Maybe there could be a special notation for last? (first=’min’, last=’max’?)
// E.g., \({@my_list[0]} = my_list[0] // If SIZE(my_list) > 0.
// \){@my_list[‘min’]} = my_list[FIRST_INDEX(my_list)].
// ${@my_list[‘max’]} = my_list[LAST_INDEX(my_list)].
KTH_LAST_INDEX(my_list, 0)
} |
PERCENTILE |
/**
Retrieve the quantile value for which p proportion of my_list
is smaller / ranked / sorted lower.
Examples:
PERCENTILE(RANGE(90,101), 50) = 95 // Median between 90 and 100.
median = PERCENTILE(RANDOM_LIST(500), 50); BETWEEN(median,0.4,0.6) = 1
@param my_list A list.
@param percent Number between 0 and 100.
@return percentile The value corresponding to the percent-ile of my_list.
*/
function percentile(my_list, percent) {
// my_list : Numerical list.
// percent : Number between 0 and 100.
QUANTILE(my_list, percent / 100)
} |
INT |
/**
Rename to_int/int32() and strip out any leading zeros because leading zeros in strings
are treated as octal: TO_INT(‘08’) is invalid, TO_INT(‘010’) = 8 base 10. This behavior
is an artifact of Java’s casting of strings to integers.
Examples::
INT(0) = 0
INT(‘0’) = 0
INT(‘000’) = 0
INT(‘08’) = 8
INT(‘010’) = 10
INT(‘027’) = 27
INT(‘0127’) = 127
INT(‘0-1’) = -1
INT(‘00-001’) = -1 // Strip leading zeros, TO_INT(‘-001’) = -1.
@param x Scalar or string.
@return y Integer.
*/
function int(x) {
// Strip out leading zeros to avoid any octal issues.
IF(TO_INT(x)!=0, TO_INT(REGEX_REPLACE(TO_STRING(x),[‘^0+’,’’])), 0)
} |
LIST_REPLACE_VALUE |
/**
Replace the value of my_list at index.
Examples:
LIST_REPLACE_VALUE(RANGE(-3,3), 100, 3) = [-3,-2,-1,100,1,2]
LIST_REPLACE_VALUE([], null, 0) = []
LIST_REPLACE_VALUE([], ‘’, 0) = []
LIST_REPLACE_VALUE([null], ‘’, 0) = [‘’]
LIST_REPLACE_VALUE([‘’], null, 0) = [null]
LIST_REPLACE_VALUE([‘’], ‘’, 0) = [‘’]
LIST_REPLACE_VALUE([0,1], null, 0) = [null, 1]
LIST_REPLACE_VALUE([], ‘a’, 0) = []
LIST_REPLACE_VALUE([], [], 0) = []
@param my_list A list.
@param value Replacement value.
@param index Index that value will replace.
@return my_replace_list my_list with the value of at index replaced by value.
*/
function list_replace_value(my_list, value, index) {
list_size = SIZE(my_list);
indices = RANGE(0,list_size);
MAP(indices, (i) -> IF(i = index, value, my_list[i]))
} |
IS_NULL |
/**
Test whether x is null. (Warning: TO BE DEPRECATED. Use =null instead.)
Examples:
IS_NULL(null) = TRUE
IS_NULL(2) = FALSE
IS_NULL(‘hi’) = FALSE
IS_NULL([null,null,2,2,4,4]) = FALSE
@param x An object.
@return is_null A boolean denoting whether x is null.
*/
function IS_NULL(x) {
// Warning: TO BE DEPRECATED. Use =null instead.
x = null
} |
NEXT |
/**
Return the next event with the same type as my_event.
Examples:
NEXT(self) != self // The next event is not the current event.
NEXT(timeline.events[0]) = NEXT_EVENT_(timeline.events[0].timestamp, timeline.events[0].type)
@param my_event An event from timeline.events or the current in-focus event, event / self.
@return next_event The next event of the same type as my_event.
*/
function next(my_event) {
NEXT_EVENT_(my_event.timestamp, my_event.type)
}
// Future version.
// function next(my_event) {
// idx = FILTER(RANGE(my_event.index + 1, SIZE(timeline.events)),
// (i) -> timeline.events[i].type = my_event.type,
// (n,x) -> n > 1)[0]
// timeline.events[idx]
// } |
MOST_RECENT_SORTED |
/**
Find and return the most recent event of a given type, assuming that
timeline.events is sorted by each event’s timestamp.
Examples:
// No recent records from the first record.
MOST_RECENT_SORTED(timestamp, ‘request’) = null
// First record is the most recent from itself + 1 millisecond.
MOST_RECENT_SORTED(timestamp+1, type) = ${@timeline.events[0]}
@param current_time Timestamp in milliseconds.
@param filter_type String denoting the event type, e.g., ‘request’.
@return event_list_including_ties A single event.
*/
function most_recent_sorted(current_time, filter_type) {
time_filtered_events = FILTER(timeline.events,
(x) -> x.timestamp < current_time AND x.type=filter_type,
(n,x) -> x.timestamp >= current_time);
// Return the last event in the list, since events are sorted by timestamp. No tie breaker.
last_event = time_filtered_events[-1];
last_event
} |
BUILT_IN_TESTS |
/**
A list of test cases for UDF built-in functions dependencies. Add to the
list of examples in the documentation below.
Examples:
// Add UDF-related built-in test cases here.
BUILT_IN_TESTS()
current = timestamp; SET_PROPERTY(event, ‘timestamp’, 0); current != 0
current = timestamp; SET_PROPERTY(event, ‘timestamp’, 0); event.timestamp = 0
current = timestamp; SET_PROPERTY(event, ‘timestamp’, null); current != null
current = timestamp; SET_PROPERTY(event, ‘timestamp’, null); event.timestamp != null // TODO: Should = null
SUM(null) = null
SUM([null]) = 0
// TODO: FAILS: SUM([‘’]) // TODO: 0. Non-numeric strings should be ignored.
// TODO: FAILS: SUM([‘c’]) // TODO: 0. Non-numeric strings should be ignored.
// TODO: FAILS: SUM([‘c’,1]) // TODO: 1. Non-numeric strings should be ignored.
SUM([]) = 0
SUM([‘2’]) = 2
AVG(null) = null
// AVG([null]) = NaN // TODO: null. Divide by zero? = TRUE
// TODO: FAILS: AVG([‘’]) // TODO: 0. Non-numeric strings should be ignored.
// TODO: FAILS: AVG([‘c’]) // TODO: 0. Non-numeric strings should be ignored.
// TODO: FAILS: AVG([‘c’,1]) // TODO: 1. Non-numeric strings should be ignored.
// TODO: NaNs. AVG([]) = NaN // TODO: null. Divide by zero? = TRUE
AVG([‘2’]) = 2
AG_MIN(null) = null
AG_MIN([null]) = null
AG_MIN([‘’]) = null
AG_MIN([‘c’]) = null
AG_MIN([‘c’,1]) = 1 // Ignores strings.
AG_MIN([]) = null
AG_MAX(null) = null
AG_MAX([null]) = null
AG_MAX([‘’]) = null
AG_MAX([‘c’]) = null
AG_MAX([‘c’,1]) = 1 // Ignores strings.
AG_MAX([]) = null
COUNT(null) = null
COUNT([null]) = 1
COUNT([‘’]) = 1
COUNT([‘c’]) = 1
COUNT([‘c’,1]) = 2
COUNT([]) = 0
COALESCE(null,’coalesced’) = ‘coalesced’
COALESCE([null],’coalesced’) = [null]
COALESCE([‘’],’coalesced’) = [‘’]
COALESCE([],’coalesced’) = []
COALESCE(null,’coalesced’) = ‘coalesced’
COALESCE(1,’coalesced’) = 1
COALESCE(AVG([]),’coalesced’) = ‘coalesced’ // NaN coalesces.
// SIZE(1) // Invalid input: Error.
// SIZE(‘c’) // Invalid input: Error.
SIZE(null) = null
SIZE([]) = 0
SIZE([null]) = 1
SIZE([null,1]) = 2
SIZE([‘’]) = 1
SIZE([‘c’]) = 1
SIZE([‘c’,1]) = 2
function test() { IFRETURN(TRUE, ‘early’); ‘late’} test() = ‘early’
function test() { IFRETURN(FALSE, ‘early’); ‘late’} test() = ‘late’
function test() { IFRETURN(‘tRuE’, ‘early’); ‘late’} test() = ‘early’
function test() { IFRETURN(0, ‘early’); ‘late’} test() = ‘late’
function test() { IFRETURN(1, ‘early’); ‘late’} test() = ‘early’
function test() { IFRETURN(‘c’, ‘early’); ‘late’} test() = ‘late’
function test() { IFRETURN(null, ‘early’); ‘late’} test() = ‘late’
function test() { IFRETURN([], ‘early’); ‘late’} test() = ‘late’
function test() { IFRETURN([null], ‘early’); ‘late’} test() = ‘early’
function test() { IFRETURN([‘’], ‘early’); ‘late’} test() = ‘early’
function test() { IFRETURN([‘c’], ‘early’); ‘late’} test() = ‘early’
function test() { IFRETURN([‘c’,1], ‘early’); ‘late’} test() = ‘early’
function test() { IFRETURN([null], ‘early’); ‘late’} test() = ‘early’
function test() { IFRETURN([null,1], ‘early’); ‘late’} test() = ‘early’
SIZE(PARSE_KERNELS(‘0.5m,4h,1d,3d’)) = 4
SIZE(PARSE_KERNELS(‘4h, 1d’)) = 2
PARSE_KERNELS(‘1d’)[0].distribution = null
PARSE_KERNELS(‘1d-e’)[0].distribution != null // EXP is a distribution, not a string.
PARSE_KERNELS(‘1d,1d-e’) != null
PARSE_KERNELS(‘0.5m-u’) != null
PARSE_KERNELS(‘0.5m-U’) != null
TRUE // PARSE_KERNELS(‘0.5m-X’) = null // Returns failure-to-parse distribution ‘x’ warning.
FLOOR(null) = null
FLOOR(-1) = -1
FLOOR(-1.0) = -1
FLOOR(-1.01) = -2
TRUE // Fails: FLOOR(‘c’) // Warning: Null pointer exception.
TRUE // Fails: FLOOR([]) // Warning: Null pointer exception.
TRUE // Fails: FLOOR([1.1]) // Warning: Null pointer exception.
SET_PROPERTY(event, ‘timestamp’, 0); event.timestamp = 0 // Replaces with 0.
x = event.timestamp; SET_PROPERTY(event, ‘timestamp’, null); event.timestamp = x // Does nothing.
// Standard 1-parameter: categorical.
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘c’)[0]) = ‘c-1d-e:1’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘c’)[0]) = ‘c-1d-e:1’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘c’)) = ‘[c-1d-e:1]’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(3)) = ‘[3-1d-e:1]’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘3’)) = ‘[3-1d-e:1]’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(3.0)) = ‘[3.0-1d-e:1]’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(TO_STRING(3.0))) = ‘[3-1d-e:1]’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘:-^%&’)) = ‘[:-^%&-1d-e:1]’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘::’)) = ‘[::-1d-e:1]’
// Standard 2-parameter: categorical and numerical inputs.
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘c’,0)) = ‘[]’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘c’,1)) = ‘[c-1d-e:1]’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘c’,-2)) = ‘[c-1d-e:-2]’
// Manipulating the ‘kernels’ background variable.
kernels=’1d-t’; TO_STRING(SUM_OPPORTUNITIES(‘c’)) = ‘[c-1d-t:1]’
// kernels=’1d-x’; TO_STRING(SUM_OPPORTUNITIES(‘c’)) = null // Warning: Null pointer exception: ‘x’ is invalid
kernels=’1d-t,1d-u’; TO_STRING(SUM_OPPORTUNITIES(‘c’)) = ‘[c-1d-t:1, c-1d-u:1]’
// Edge cases with invalid and/or unexpected inputs.
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(null)) = null
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘’)) = null
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES([null])) = ‘[[null]-1d-e:1]’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES([‘’])) = ‘[[]-1d-e:1]’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES([‘c’,1])) = ‘[[c, 1]-1d-e:1]’
kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES([])) = ‘[[]-1d-e:1]’
// kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘c’,null)) = null // Warning: Null pointer exception.
// kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘c’,’d’)) = null // Warning: Null pointer exception.
// kernels=’1d’; TO_STRING(SUM_OPPORTUNITIES(‘c’,[])) = null // Warning: Null pointer exception.
TRUE
// List subsetting and reversal.
[0,1,2][0] = 0
[0,1,2][[0]][[0]][0] = 0
// RANGE(0,3)[[0]][[0]][0] = 0 // Fails
// timeline.events[[0]][[0]][0] = timeline.events[0] // Fails
RANGE(0,3)[0] = 0
[0,10,20][[0,1,2]][0] = 0
[0,10,20][RANGE(0,3)][0] = 0
// RANGE(0,3)[[0,1,2]][0] = 0 // Fails
// RANGE(0,3)[RANGE(0,3)][0] = 0 // Fails
// timeline.events[[0,1,2]][0] = timeline.events[0] // Fails
[timeline.events[0]][0] = timeline.events[0]
LIST_REVERSE(RANGE(0,5)) = [4,3,2,1,0]
FILTER(LIST_REVERSE(RANGE(0,5)), (x) -> TRUE) = [4,3,2,1,0]
MAP(LIST_REVERSE(RANGE(0,5)), (x) -> x) = [4,3,2,1,0]
my_list = [0,1,2,3,4]; LIST_REVERSE(my_list)[my_list] = [4,3,2,1,0]
// LIST_REVERSE(timeline.events[RANGE(0,5)])[0] = timeline.events[4] // FAILS
// timeline.events[RANGE(0,5)][0] = timeline.events[0] // FAILS
// my_list = [0,1,2,3,4]; timeline.events[my_list][0] = timeline.events[0] // FAILS
// my_list = [0,1,2,3,4]; LIST_REVERSE(timeline.events)[my_list][0] = timeline.events[4] // FAILS
FILTER(LIST_REVERSE(timeline.events), (x) -> FALSE) = []
@return TRUE Returns TRUE
*/
function BUILT_IN_TESTS() {
TRUE
} |
MEAN_IMPUTE |
/**
Replace all null values of my_list with the mean of the
non-null elements.
Examples:
MEAN_IMPUTE([null,null,2,4]) = [3,3,2,4]
@param my_list A list, e.g., [null,null,2,4].
@return my_imputed_list A list with nulls imputed using the mean.
*/
function mean_impute(my_list) {
mean = AVG_NS(my_list);
MAP(my_list, (x) -> COALESCE(x,mean))
} |
BIN |
/**
Scale x into integers by dividing by width. To adjust location, do
pre-/post-adjustments: e.g., BIN() + location.
Examples:
SET_PROPERTY(‘timestamp’,1587772800000);
BIN(timestamp, 7*24*3600*1000) = 2625 // Weeks.
SET_PROPERTY(‘timestamp’,1587772800000);
BIN(timestamp, 24*3600*1000) = 18377 // Days.
SET_PROPERTY(‘timestamp’,1587772800000);
BIN(timestamp, 6*3600*1000) = 73508 // 6-hour blocks.
SET_PROPERTY(‘timestamp’,1587772800000);
BIN(timestamp, 3600*1000) = 441048 // Hours.
@param x Scalar.
@param width Scalar.
@return x_bin The integer bin that x falls into: FLOOR(x/width).
*/
function bin(x, width) {
FLOOR(x / width)
} |
LIST_HISTOGRAM |
/**
Apply WHICH_BIN() to a list and use COUNT_UNIQ() to
tabulate the results into a histogram map.
Examples:
LIST_HISTOGRAM([1,1,2,4],[-2,0,3,10,40]) = {‘0’:3, ‘3’:1, ‘-2’:0, ‘10’:0, ‘40’:0}
rng = RNG(‘test’);
my_list = MAP(RANGE(0,100), (x) -> rng.next); // Numerical
bin_edges = RANGE_STEP(0,1.001,0.1);
LIST_HISTOGRAM(my_list,bin_edges)
= {'0':5, '0.1':9, '0.2':13, '0.30000000000000004':12, '0.4':7, '0.5':12,
'0.6000000000000001':9, '0.7000000000000001':13, '0.8':6, '0.9':14, '1':0}
@param my_list A list, e.g., [1,1,2,4].
@param bin_edges A pre-sorted (increasing) numeric list, e.g., [-2,0,3,10,40].
@return my_histogram A map of {bin_edge=count} including all histogram bins.
*/
function list_histogram(my_list, bin_edges) {
obs_bin = MAP(my_list,
(x) -> WHICH_BIN(x, bin_edges));
my_histogram = COUNT_UNIQ(STRING_LIST(obs_bin));
// Include bin_edges with zero counts?
// Which bins are missing?
missing_bins = FILTER(bin_edges,
(x) -> IF(MAP_VALUE(my_histogram,TO_STRING(x)),0,1));
map_zeros = LISTS_TO_MAP(missing_bins,
MAP(missing_bins, (x) -> 0));
my_histogram = MAP_UNION(my_histogram, map_zeros);
my_histogram
} |
AG_MIN_MAP |
/**
Return the key:value pair with the minimum value as
a string, e.g., num:2.
Examples:
my_map = JSON_TO_MAP(‘{“key1”:10,”key2”:2}’);
AG_MIN_MAP(my_map) = 'key2:2'
my_list = [1,1,2,2,3,3,3,3];
my_map = COUNT_UNIQ(INT_LIST(my_list));
result = AG_MIN_MAP(my_map);
(result = '1:2' OR result = '2:2') // Random tiebreaker.
@param my_map A map, e.g., {cat=1, num=2}.
@return kv_string A key-value pair as a string, e.g., num:2.
*/
function ag_min_map(my_map) {
// AG_MAX(my_list) // Works for lists.
// AG_MAX(my_map) // Doesn’t work for maps.
map_string = MAP_TO_LIST(my_map,
(k,v) -> CONCAT_WS(‘:’,k,v));
REDUCE(map_string,
(x,z) -> IF(PARSE_KV_NUM(x) < PARSE_KV_NUM(z),
x,
z))
} |
EVENT_HISTOGRAM |
/**
Given a list of timestamps, compute the HISTOGRAM().
causmos and offset determine the reference_time to use. All times
look back (e.g., seconds/days ago). Use negative time to look forward into the
future.
Examples:
// Inputs to Set
causmos = FALSE; // Which type of model? Event or treatment effect model?
index = 0; // Loops from 0 to SIZE(knots) - 2.
knots = [0, 7]; // In days.
offset = - 60 / (24*3600); // In seconds.
cumulative = FALSE; // Count always from knot[0] or knot[index] to knot[index+1]?
counts = FALSE; // Counts or time rates (divide by duration)?
// Pre-filter the events, extract timestamps.
filtered_timestamps = MAP(FILTER(timeline.events,
(x) -> x.type = type), // Current record's type.
-
// We set the offset forward in time and the type to include the current record's
// type, so we should find at least one record in the [0,7]-day window. We then
// set counts=FALSE, so we will compute a rate = 1 record over the 7 days = 1/7.
EVENT_HISTOGRAM(filtered_timestamps, index, knots, offset,
cumulative, counts, causmos) >= 1/7
@param filtered_timestamps List of timestamps.
@param index Knot index/bin to be evaluated, e.g., from 0 to SIZE(knots) - 2.
@param knots List of knots to be considered, e.g., [0, 7] // Days.
@param offset Timestamp adjustment in days, e.g., 60 / (24*3600) for 60 seconds.
@param cumulative Count from knot[0] (TRUE) or knot[index] (FALSE) to knot[index+1]?
@param counts Counts (TRUE) or time rates / divide by duration (FALSE)?
@param causmos Use outcome.timestamp (TRUE) or timestamp (FALSE) for
the reference timestamp? Also, model type for feature: Event-level
(FALSE) or Causmos treatment model (TRUE)?)
@return y Scalar count or rate, depending on counts.
*/
function event_histogram(filtered_timestamps, index, knots, offset, cumulative, counts, causmos) {
// Given a list of timestamps, compute the HISTOGRAM().
// causmos and offset determine the reference_time to use.
//
// causmos = FALSE; // Which type of model? Event or treatment effect model?
// offset = 60 / (24*3600); // In days.
//
// index = 0; // Do loops from 0 to SIZE(knots) - 2 (Which bin to count?)
// knots = [0, 7]; // In days.
// cumulative_counts = FALSE; // Count always from knot[0] or knot[index] to knot[index+1]?
// return_counts = FALSE; // Counts or time rates (divide by duration)?
millis_day = 2436001000;
// Which type of model? Event or treatment effect model?
// Determine the source of and set the reference time for counting.
reference_time = IF(causmos,
outcome.timestamp,
timestamp) - offset * millis_day;
// Reference time is ‘How long since this event?’ or ‘This event happened X days ago.’
events_relative_days = MAP(filtered_timestamps,
(x) -> (reference_time - x)/millis_day);
HISTOGRAM(events_relative_days, index, knots, cumulative, counts)
} |
LEN |
/**
function len(my_list) {
// Alias for SIZE().
SIZE(my_list)
} |
AG_MAX_MAP |
/**
Return the key:value pair with the maximum value as
a string, e.g., num:2.
Examples:
my_map = JSON_TO_MAP(‘{“key1”:10,”key2”:2}’);
AG_MAX_MAP(my_map) = 'key1:10'
my_list = [1,1,2,2,3,3,3,3];
my_map = COUNT_UNIQ(INT_LIST(my_list));
AG_MAX_MAP(my_map) = '3:4'
@param my_map A map, e.g., {cat=1, num=2}.
@return kv_string A key-value pair as a string, e.g., num:2.
*/
function ag_max_map(my_map) {
// AG_MAX(my_list) // Works for lists.
// AG_MAX(my_map) // Doesn’t work for maps.
map_string = MAP_TO_LIST(my_map,
(k,v) -> CONCAT_WS(‘:’,k,v));
REDUCE(map_string,
(x,z) -> IF(PARSE_KV_NUM(x) > PARSE_KV_NUM(z),
x,
z))
} |
RESCALE |
/**
Transform x by location and scale parameters:
x * scale + location.
Examples:
RESCALE(5,1,2) = 11 // 5 * 2 + 1
@param x Scalar to be transformed.
@param location Scalar.
@param scale Scalar.
@return y x * scale + location.
*/
function rescale(x, location, scale) {
x * scale + location
} |
AG_MIN_NS |
/**
Return the minimum value of my_list, ignoring nulls.
Examples:
AG_MIN_NS([null,null,2,2,2,4,4,4]) = 2
AG_MIN_NS(null) = null
AG_MIN_NS([null]) = null
AG_MIN_NS([‘’]) = null
AG_MIN_NS([‘c’]) = null
AG_MIN_NS([‘c’,1]) = 1
AG_MIN_NS([]) = null
@param my_list A list, e.g., [null,null,2,2,2,4,4,4].
@return min The minimum value of non-null elements of my_list.
*/
function ag_min_ns(my_list) {
AG_MIN(DROP_NULLS(my_list))
} |
RANDOM_LIST |
/**
Create a list with n draws from RANDOM().
Examples:
SIZE(RANDOM_LIST(5)) = 5
SUM(RANDOM_LIST(5)) > 0
SUM(RANDOM_LIST(5)) < 5
@param n Integer. Number of random draws.
@return random_list List with n draws from RANDOM().
*/
function random_list(n) {
MAP(RANGE(0,n), (x) -> RANDOM())
} |
STRICTLY_BETWEEN |
/**
If start < x < end, return 1. Otherwise return 0.
Examples:
start = 700; end = -700; // Days
reference_time = MILLIS(TO_DATETIME('2022-01-01 9:00:00.000'));
// Replace this expression...
IF(reference_time - end*24*3600*1000 < timestamp
AND timestamp < reference_time - start*24*3600*1000, 1, 0)
-
// With this more readable expression using the UDF.
BETWEEN(reference_time - timestamp,
-
-
@param x Scalar.
@param start Scalar.
@param end Scalar.
@return is_x_strictly_between IF(start < x < end, 1, 0).
*/
function strictly_between(x, start, end) {
IF(start < x AND x < end, 1, 0)
} |
SPLINE |
/**
j^th-order polynomial spline at knot x_k: (x-x_k)^j * 1(x>x_k).
Example:
SPLINE(20, 40, 0) = 0 // Knot at 40.
SPLINE(20, 40, 1) = 0 // Knot at 40.
SPLINE(40, 40, 1) = 0 // Knot at 40.
SPLINE(50, 40, 0) = 1 // Knot at 40.
SPLINE(50, 40, 1) = 10 // Knot at 40.
SPLINE(50, 40, 2) = 100 // Knot at 40.
@param x Scalar.
@param knot Scalar knot/breakpoint/kink in the piecewise spline.
@param order Order/degree of polynomial spline term.
@return y Value of the spline, given x, knot, and order.
*/
function spline(x, knot, order) {
// j^th-order polynomial spline at knot x_k: (x-x_k)^j * 1(x>x_k)
IF(x > knot, POW(x - knot, order), 0)
} |
LIST_HIST |
/**
Use LIST_HISTOGRAM() with num_bins, setting the bin
range to the minimum and maximum values of my_list.
Examples:
LIST_HIST([1,1,2,4],4) = {‘1’:2, ‘2’:1, ‘3’:0, ‘4’:1}
LIST_HIST([1.0,1,2.0,4],4) = {‘1’:2, ‘2’:1, ‘3’:0, ‘4’:1}
@param my_list A list, e.g., [1,1,2,4].
@param num_bins Number of histogram bins to create.
@return my_histogram A map of {bin_edge=count} for each histogram bin.
*/
function list_hist(my_list, num_bins) {
// Evenly spaced num_bins between min and max of ‘my_list’.
min_edge = AG_MIN(my_list);
max_edge = AG_MAX(my_list);
edges = RANGE_N(min_edge,max_edge,num_bins);
my_histogram = LIST_HISTOGRAM(my_list, edges);
my_histogram
} |
LIST_CONCAT |
/**
Concatenate the elements of my_list.
Examples:
LIST_CONCAT([‘3’,’2’,’1’]) = ‘321’
LIST_CONCAT(MAP([3,2,1],(y) -> TO_INT(y))) = ‘321’
LIST_CONCAT([‘’]) = ‘’
LIST_CONCAT(null) = null
@param my_list List to be concatenated.
@return my_list_concat String with the elements of my_list concatenated.
*/
function list_concat(my_list) {
REDUCE(MAP(my_list,
(x) -> TO_STRING(x)),
(x,y) -> CONCAT(x, y))
} |
DIV2 |
/**
function div2(x) {
x / 2
} |