import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
BASE = '../input/godaddy-microbusiness-density-forecasting/'

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error; a (0, 0) pair scores 0, not NaN."""
    smap = np.zeros(len(y_true))

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Only score rows where at least one value is non-zero, avoiding 0/0.
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * np.mean(smap)
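
A quick spot check of the metric on toy values (illustrative only):

y_true = np.array([3.0, 2.0, 0.0, 1.0])
y_pred = np.array([3.0, 2.5, 0.0, 0.5])
print(smape(y_true, y_pred))  # ~22.2; the (0, 0) pair contributes 0, not NaN
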
census = pd.read_csv(BASE + 'census_starter.csv')
train = pd.read_csv(BASE + 'train.csv')
revealed_test = pd.read_csv(BASE + 'revealed_test.csv')
train = pd.concat([train, revealed_test]).sort_values(by=['cfips','first_day_of_month']).reset_index()
test = pd.read_csv(BASE + 'test.csv')
drop_index = (test.first_day_of_month == '2022-11-01') | (test.first_day_of_month == '2022-12-01')
test = test.loc[~drop_index,:]

sub = pd.read_csv(BASE + 'sample_submission.csv')
coords = pd.read_csv("/kaggle/input/usa-counties-coordinates/cfips_location.csv")
print(train.shape, test.shape, sub.shape)

train['istest'] = 0
test['istest'] = 1
raw = pd.concat((train, test)).sort_values(['cfips','row_id']).reset_index(drop=True)
raw = raw.merge(coords.drop("name", axis=1), on="cfips")

raw['state_i1'] = raw['state'].astype('category')
raw['county_i1'] = raw['county'].astype('category')
raw['first_day_of_month'] = pd.to_datetime(raw["first_day_of_month"])
raw['county'] = raw.groupby('cfips')['county'].ffill()
raw['state'] = raw.groupby('cfips')['state'].ffill()
raw["dcount"] = raw.groupby(['cfips'])['row_id'].cumcount()
raw['county_i'] = (raw['county'] + raw['state']).factorize()[0]
raw['state_i'] = raw['state'].factorize()[0]
raw['scale'] = (raw['first_day_of_month'] - raw['first_day_of_month'].min()).dt.days
raw['scale'] = raw['scale'].factorize()[0]
os.environ["CUDA_VISIBLE_DEVICES"]="0"
(128535, 8) (18810, 3) (25080, 2)
raw
index row_id cfips county state first_day_of_month microbusiness_density active istest lng lat state_i1 county_i1 dcount county_i state_i scale
0 0.0 1001_2019-08-01 1001 Autauga County Alabama 2019-08-01 3.007682 1249.0 0 -86.642900 32.535142 Alabama Autauga County 0 0 0 0
1 1.0 1001_2019-09-01 1001 Autauga County Alabama 2019-09-01 2.884870 1198.0 0 -86.642900 32.535142 Alabama Autauga County 1 0 0 1
2 2.0 1001_2019-10-01 1001 Autauga County Alabama 2019-10-01 3.055843 1269.0 0 -86.642900 32.535142 Alabama Autauga County 2 0 0 2
3 3.0 1001_2019-11-01 1001 Autauga County Alabama 2019-11-01 2.993233 1243.0 0 -86.642900 32.535142 Alabama Autauga County 3 0 0 3
4 4.0 1001_2019-12-01 1001 Autauga County Alabama 2019-12-01 2.993233 1243.0 0 -86.642900 32.535142 Alabama Autauga County 4 0 0 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
147340 NaN 56045_2023-02-01 56045 Weston County Wyoming 2023-02-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 42 3134 50 42
147341 NaN 56045_2023-03-01 56045 Weston County Wyoming 2023-03-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 43 3134 50 43
147342 NaN 56045_2023-04-01 56045 Weston County Wyoming 2023-04-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 44 3134 50 44
147343 NaN 56045_2023-05-01 56045 Weston County Wyoming 2023-05-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 45 3134 50 45
147344 NaN 56045_2023-06-01 56045 Weston County Wyoming 2023-06-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 46 3134 50 46

147345 rows × 17 columns

There are some anomalies in the series, especially at timestep 18. The loop below walks each county backwards and, whenever a month-over-month jump exceeds 10% of the running mean, shifts all earlier history to close the gap.
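
Before smoothing, a quick diagnostic sketch (not part of the original pipeline) shows where the large jumps cluster by timestep; per the note above, dcount 18 is expected to dominate:

tmp = raw[raw.istest == 0].copy()
tmp['chg'] = tmp.groupby('cfips')['microbusiness_density'].pct_change().abs()
print((tmp['chg'] > 0.10).groupby(tmp['dcount']).sum().sort_values(ascending=False).head())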

for o in tqdm(raw.cfips.unique()):
    indices = (raw['cfips'] == o)
    tmp = raw.loc[indices].copy().reset_index(drop=True)
    var = tmp.microbusiness_density.values.copy()
    # Walk backwards: when a month-over-month jump exceeds 10% of the
    # running mean, shift all earlier history up/down to close the gap.
    for i in range(37, 2, -1):
        thr = 0.10 * np.mean(var[:i])
        difa = var[i] - var[i - 1]
        if (difa >= thr) or (difa <= -thr):
            if difa > 0:
                var[:i] += difa - 0.003
            else:
                var[:i] += difa + 0.003
    # Nudge the first point just below the second to avoid a spurious initial spike.
    var[0] = var[1] * 0.99
    raw.loc[indices, 'microbusiness_density'] = var
{"model_id":"c5336d6305e14089a90c3de2a2101eda","version_major":2,"version_minor":0}
raw
index row_id cfips county state first_day_of_month microbusiness_density active istest lng lat state_i1 county_i1 dcount county_i state_i scale
0 0.0 1001_2019-08-01 1001 Autauga County Alabama 2019-08-01 2.856021 1249.0 0 -86.642900 32.535142 Alabama Autauga County 0 0 0 0
1 1.0 1001_2019-09-01 1001 Autauga County Alabama 2019-09-01 2.884870 1198.0 0 -86.642900 32.535142 Alabama Autauga County 1 0 0 1
2 2.0 1001_2019-10-01 1001 Autauga County Alabama 2019-10-01 3.055843 1269.0 0 -86.642900 32.535142 Alabama Autauga County 2 0 0 2
3 3.0 1001_2019-11-01 1001 Autauga County Alabama 2019-11-01 2.993233 1243.0 0 -86.642900 32.535142 Alabama Autauga County 3 0 0 3
4 4.0 1001_2019-12-01 1001 Autauga County Alabama 2019-12-01 2.993233 1243.0 0 -86.642900 32.535142 Alabama Autauga County 4 0 0 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
147340 NaN 56045_2023-02-01 56045 Weston County Wyoming 2023-02-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 42 3134 50 42
147341 NaN 56045_2023-03-01 56045 Weston County Wyoming 2023-03-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 43 3134 50 43
147342 NaN 56045_2023-04-01 56045 Weston County Wyoming 2023-04-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 44 3134 50 44
147343 NaN 56045_2023-05-01 56045 Weston County Wyoming 2023-05-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 45 3134 50 45
147344 NaN 56045_2023-06-01 56045 Weston County Wyoming 2023-06-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 46 3134 50 46

147345 rows × 17 columns

lag = 1
# Lagged density; bfill fills each county's first month (data is sorted by cfips,
# so the backfilled value comes from the same county's second month).
raw[f'mbd_lag_{lag}'] = raw.groupby('cfips')['microbusiness_density'].shift(lag).bfill()
# Absolute relative month-over-month change, with explicit handling of zero lags.
raw[f'dif_{lag}'] = (raw['microbusiness_density'] / raw[f'mbd_lag_{lag}']).fillna(1).clip(0, None) - 1
raw.loc[(raw[f'mbd_lag_{lag}']==0), f'dif_{lag}'] = 0
raw.loc[(raw[f'microbusiness_density']>0) & (raw[f'mbd_lag_{lag}']==0), f'dif_{lag}'] = 1
raw[f'dif_{lag}'] = raw[f'dif_{lag}'].abs()
# raw.groupby('dcount')['dif'].sum().plot()
raw.tail()
index row_id cfips county state first_day_of_month microbusiness_density active istest lng lat state_i1 county_i1 dcount county_i state_i scale mbd_lag_1 dif_1
147340 NaN 56045_2023-02-01 56045 Weston County Wyoming 2023-02-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 42 3134 50 42 NaN 0.0
147341 NaN 56045_2023-03-01 56045 Weston County Wyoming 2023-03-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 43 3134 50 43 NaN 0.0
147342 NaN 56045_2023-04-01 56045 Weston County Wyoming 2023-04-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 44 3134 50 44 NaN 0.0
147343 NaN 56045_2023-05-01 56045 Weston County Wyoming 2023-05-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 45 3134 50 45 NaN 0.0
147344 NaN 56045_2023-06-01 56045 Weston County Wyoming 2023-06-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 46 3134 50 46 NaN 0.0
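
To make the zero handling concrete, a toy spot check (hypothetical values):

demo = pd.DataFrame({'mbd': [0.0, 0.5, 0.6], 'mbd_lag_1': [0.0, 0.0, 0.5]})
demo['dif_1'] = (demo['mbd'] / demo['mbd_lag_1']).fillna(1).clip(0, None) - 1
demo.loc[demo['mbd_lag_1'] == 0, 'dif_1'] = 0                        # 0 -> 0: no change
demo.loc[(demo['mbd'] > 0) & (demo['mbd_lag_1'] == 0), 'dif_1'] = 1  # 0 -> positive: capped at 1
print(demo['dif_1'].abs().tolist())  # [0.0, 1.0, ~0.2]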

SMAPE is a relative metric, so the level target is converted into a one-month growth rate: target_t = mbd_{t+1} / mbd_t - 1.
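
Concretely, the conversion and its inverse (used later when rolling forecasts forward), on made-up numbers:

cur, nxt = 3.00, 3.06     # density at months t and t+1 (illustrative values)
rate = nxt / cur - 1      # relative target the models learn: 0.02
print(cur * (1 + rate))   # the inverse step recovers 3.06 in the forecast loop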

raw['target'] = raw.groupby('cfips')['microbusiness_density'].shift(-1)
raw['target'] = raw['target'] / raw['microbusiness_density'] - 1

# Pin two outlier counties to zero growth.
raw.loc[raw['cfips']==28055, 'target'] = 0.0
raw.loc[raw['cfips']==48269, 'target'] = 0.0
raw['lastactive'] = raw.groupby('cfips')['active'].transform('last')

# dt = raw.loc[raw.dcount==40].groupby('cfips')['microbusiness_density'].agg('last')
# raw['lastactive'].clip(0, 8000).hist(bins=30)
raw
index row_id cfips county state first_day_of_month microbusiness_density active istest lng ... state_i1 county_i1 dcount county_i state_i scale mbd_lag_1 dif_1 target lastactive
0 0.0 1001_2019-08-01 1001 Autauga County Alabama 2019-08-01 2.856021 1249.0 0 -86.642900 ... Alabama Autauga County 0 0 0 0 2.856021 0.000000 0.010101 1475.0
1 1.0 1001_2019-09-01 1001 Autauga County Alabama 2019-09-01 2.884870 1198.0 0 -86.642900 ... Alabama Autauga County 1 0 0 1 2.856021 0.010101 0.059265 1475.0
2 2.0 1001_2019-10-01 1001 Autauga County Alabama 2019-10-01 3.055843 1269.0 0 -86.642900 ... Alabama Autauga County 2 0 0 2 2.884870 0.059265 -0.020489 1475.0
3 3.0 1001_2019-11-01 1001 Autauga County Alabama 2019-11-01 2.993233 1243.0 0 -86.642900 ... Alabama Autauga County 3 0 0 3 3.055843 0.020489 0.000000 1475.0
4 4.0 1001_2019-12-01 1001 Autauga County Alabama 2019-12-01 2.993233 1243.0 0 -86.642900 ... Alabama Autauga County 4 0 0 4 2.993233 0.000000 -0.008066 1475.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
147340 NaN 56045_2023-02-01 56045 Weston County Wyoming 2023-02-01 NaN NaN 1 -104.567404 ... NaN NaN 42 3134 50 42 NaN 0.000000 NaN 101.0
147341 NaN 56045_2023-03-01 56045 Weston County Wyoming 2023-03-01 NaN NaN 1 -104.567404 ... NaN NaN 43 3134 50 43 NaN 0.000000 NaN 101.0
147342 NaN 56045_2023-04-01 56045 Weston County Wyoming 2023-04-01 NaN NaN 1 -104.567404 ... NaN NaN 44 3134 50 44 NaN 0.000000 NaN 101.0
147343 NaN 56045_2023-05-01 56045 Weston County Wyoming 2023-05-01 NaN NaN 1 -104.567404 ... NaN NaN 45 3134 50 45 NaN 0.000000 NaN 101.0
147344 NaN 56045_2023-06-01 56045 Weston County Wyoming 2023-06-01 NaN NaN 1 -104.567404 ... NaN NaN 46 3134 50 46 NaN 0.000000 NaN 101.0

147345 rows × 21 columns

Build Features

def build_features(raw, target='microbusiness_density', target_act='active_tmp', lags=6):
    feats = []

    # Lagged target values and lagged diffs of the active count, per county.
    for lag in range(1, lags):
        raw[f'mbd_lag_{lag}'] = raw.groupby('cfips')[target].shift(lag)
        raw[f'act_lag_{lag}'] = raw.groupby('cfips')[target_act].diff(lag)
        feats.append(f'mbd_lag_{lag}')
        feats.append(f'act_lag_{lag}')
        
    lag = 1
    for window in [2, 4, 6, 8, 10]:
        # Despite the "rollmea" name, this is a rolling *sum* of the lagged target.
        raw[f'mbd_rollmea{window}_{lag}'] = raw.groupby('cfips')[f'mbd_lag_{lag}'].transform(lambda s: s.rolling(window, min_periods=1).sum())
        feats.append(f'mbd_rollmea{window}_{lag}')

    census_columns = list(census.columns)
    census_columns.remove("cfips")
    
    raw = raw.merge(census, on="cfips", how="left")
    feats += census_columns
    
    co_est = pd.read_csv("/kaggle/input/us-indicator/co-est2021-alldata.csv", encoding='latin-1')
    co_est["cfips"] = co_est.STATE*1000 + co_est.COUNTY
    co_columns = [
        'SUMLEV',
        'DIVISION',
        'ESTIMATESBASE2020',
        'POPESTIMATE2020',
        'POPESTIMATE2021',
        'NPOPCHG2020',
        'NPOPCHG2021',
        'BIRTHS2020',
        'BIRTHS2021',
        'DEATHS2020',
        'DEATHS2021',
        'NATURALCHG2020',
        'NATURALCHG2021',
        'INTERNATIONALMIG2020',
        'INTERNATIONALMIG2021',
        'DOMESTICMIG2020',
        'DOMESTICMIG2021',
        'NETMIG2020',
        'NETMIG2021',
        'RESIDUAL2020',
        'RESIDUAL2021',
        'GQESTIMATESBASE2020',
        'GQESTIMATES2020',
        'GQESTIMATES2021',
        'RBIRTH2021',
        'RDEATH2021',
        'RNATURALCHG2021',
        'RINTERNATIONALMIG2021',
        'RDOMESTICMIG2021',
        'RNETMIG2021'
    ]
    raw = raw.merge(co_est, on="cfips", how="left")
    feats +=  co_columns
    return raw, feats
# Build features based on lags of the target
raw, feats = build_features(raw, 'target', 'active', lags=9)
features = ['state_i']
features += feats
features += ['lng','lat','scale']
# print(features)
# raw.loc[raw.dcount==40, features].head(10)
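
A minimal sanity-check sketch (not in the original notebook) before training on this feature matrix:

# Count features and surface the columns with the most NaNs on the training slice.
print(len(features), 'features')
print(raw.loc[raw.istest == 0, features].isna().mean().sort_values(ascending=False).head())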

Latitude and longitude feature engineering, adapted from samu2505.

coordinates = raw[['lng', 'lat']].values

# Sinusoidal encoding trick: scale each coordinate by emb_size geometric
# frequencies (spanning up to `precision`), then take cos/sin of the angles.
emb_size = 20
precision = 1e6

latlon = np.expand_dims(coordinates, axis=-1)

m = np.exp(np.log(precision) / emb_size)
angle_freq = m ** np.arange(emb_size)
angle_freq = angle_freq.reshape(1, 1, emb_size)
latlon = latlon * angle_freq
latlon[..., 0::2] = np.cos(latlon[..., 0::2])
latlon[..., 1::2] = np.sin(latlon[..., 1::2])  # the sin half was missing; completes the pair
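
The encoded array is never joined back to `raw` in this notebook; only the rotation features below enter the feature list. If one wanted to use it, a minimal sketch (hypothetical `latlon_enc_*` column names):

# Hypothetical: flatten the (n_rows, 2, emb_size) encoding into named columns.
enc = latlon.reshape(len(raw), -1)
enc_df = pd.DataFrame(enc, index=raw.index,
                      columns=[f'latlon_enc_{i}' for i in range(enc.shape[1])])
print(enc_df.shape)  # (len(raw), 40)
# raw = pd.concat([raw, enc_df], axis=1); features += list(enc_df.columns)
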
def rot(df):
    # Rotated coordinate projections let axis-aligned tree splits act obliquely.
    for angle in [15, 30, 45]:
        df[f'rot_{angle}_x'] = (np.cos(np.radians(angle)) * df['lat']) + \
                               (np.sin(np.radians(angle)) * df['lng'])

        df[f'rot_{angle}_y'] = (np.cos(np.radians(angle)) * df['lat']) - \
                               (np.sin(np.radians(angle)) * df['lng'])

    return df

raw = rot(raw)
features += ['rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y', 'rot_45_x', 'rot_45_y']

Model Building

def get_model():
    from sklearn.ensemble import VotingRegressor
    import lightgbm as lgb
    import xgboost as xgb
    import catboost as cat
    from sklearn.pipeline import Pipeline
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.impute import KNNImputer    

    # Note: the CatBoost iteration count could likely be reduced.
    cat_model = cat.CatBoostRegressor(
        iterations=2000,
        loss_function="MAPE",
        verbose=0,
        grow_policy='SymmetricTree',
        learning_rate=0.035,
        colsample_bylevel=0.8,
        max_depth=5,
        l2_leaf_reg=0.2,
        subsample=0.70,
        max_bin=4096,
    )

    return cat_model


def base_models():
    from sklearn.ensemble import VotingRegressor
    import lightgbm as lgb
    import xgboost as xgb
    import catboost as cat
    from sklearn.pipeline import Pipeline
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.impute import KNNImputer    
    
    # LGBM model
    params = {
        'n_iter': 300,
        'boosting_type': 'dart',
        'verbosity': -1,
        'objective': 'l1',
        'random_state': 42,
        'colsample_bytree': 0.8841279649367693,
        'colsample_bynode': 0.10142964450634374,
        'max_depth': 8,
        'learning_rate': 0.003647749926797374,
        'lambda_l2': 0.5,
        'num_leaves': 61,
        'seed': 42,
        'min_data_in_leaf': 213,
    }

    lgb_model = lgb.LGBMRegressor(**params)
    
    xgb_model = xgb.XGBRegressor(
        objective='reg:pseudohubererror',
        tree_method='hist',
        n_estimators=795,
        learning_rate=0.0075,
        max_leaves=17,
        subsample=0.50,
        colsample_bytree=0.50,
        max_bin=4096,
        n_jobs=2,
    )

    # Note: the CatBoost iteration count could likely be reduced.
    cat_model = cat.CatBoostRegressor(
        iterations=2000,
        loss_function="MAPE",
        verbose=0,
        grow_policy='SymmetricTree',
        learning_rate=0.035,
        colsample_bylevel=0.8,
        max_depth=5,
        l2_leaf_reg=0.2,
        subsample=0.70,
        max_bin=4096,
    )
    
    models = {}
    models['xgb'] = xgb_model
    models['lgbm'] = lgb_model
    models['cat'] = cat_model

    return models
ACT_THR = 150    # counties at or below this many active businesses fall back to the last value
MONTH_1 = 20
MONTH_last = 40

raw['k'] = 1.
raw['microbusiness_density'] = raw['microbusiness_density'].fillna(0)


TS = 39
print(f'TS: {TS}')

train_indices = (raw.istest==0) & (raw.dcount  < TS) & (raw.dcount >= 1) & (raw.lastactive>ACT_THR) 
valid_indices = (raw.istest==0) & (raw.dcount == TS)

# model = get_model()

models = base_models()

# Train each of the models on the current TS


lst_tr_pred = {}
lst_val_preds = {}
for key, model in models.items():
    model.fit(
        raw.loc[train_indices, features],
        raw.loc[train_indices, 'target'].clip(-0.002, 0.006))
    lst_tr_pred[key] = model.predict(raw.loc[train_indices, features])
    lst_val_preds[key] = model.predict(raw.loc[valid_indices, features])

train_preds = np.column_stack(tuple(lst_tr_pred.values()))
valid_preds = np.column_stack(tuple(lst_val_preds.values()))

meta_model = get_model() 
meta_model.fit(train_preds, raw.loc[train_indices, 'target'].clip(-0.002, 0.006))
ypred = meta_model.predict(valid_preds)

#raw.loc[valid_indices, 'target'] = ypred
raw.loc[valid_indices, 'k'] = ypred + 1
raw.loc[valid_indices,'k'] = raw.loc[valid_indices,'k'] * raw.loc[valid_indices,'microbusiness_density']

# Validate
lastval = raw.loc[raw.dcount==TS, ['cfips', 'microbusiness_density']].set_index('cfips').to_dict()['microbusiness_density']
dt = raw.loc[raw.dcount==TS, ['cfips', 'k']].set_index('cfips').to_dict()['k']

df = raw.loc[raw.dcount==(TS+1), 
             ['cfips', 'microbusiness_density', 'state', 'lastactive', 'mbd_lag_1']].reset_index(drop=True)
df['pred'] = df['cfips'].map(dt)
df['lastval'] = df['cfips'].map(lastval)

#     df.loc[df['lastval'].isnull(), 'lastval'] = df.loc[df['lastval'].isnull(), 'microbusiness_density']    

df.loc[df['lastactive']<=ACT_THR, 'pred'] = df.loc[df['lastactive']<=ACT_THR, 'lastval']

print('Last Value SMAPE:', smape(df['microbusiness_density'], df['lastval']) )
print('SMAPE:', smape(df['microbusiness_density'], df['pred']))
print()

ind = (raw.dcount > MONTH_1) & (raw.dcount <= MONTH_last)  # defined here but not used below
TS: 39
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:177: UserWarning: Found `n_iter` in params. Will use it instead of argument
  _log_warning(f"Found `{alias}` in params. Will use it instead of argument")
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
Last Value SMAPE: 1.889206717018118
SMAPE: 1.8637258032256854

# Recursive forecasting: predict one month ahead, write it back, roll forward.
# Note: the lag features built earlier are not rebuilt inside this loop.
for i in range(6):
    TS = TS + 1
    print(f'TS: {TS}')
    train_indices = (raw.istest==0) & (raw.dcount < TS) & (raw.dcount >= 1) & (raw.lastactive>ACT_THR)
    valid_indices = (raw.dcount == TS)  # includes test rows: these months are being forecast
    
    lst_tr_pred = {}
    lst_val_preds = {}
    for key, model in models.items():
        model.fit(
            raw.loc[train_indices, features],
            raw.loc[train_indices, 'target'].clip(-0.002, 0.006))
        lst_tr_pred[key] = model.predict(raw.loc[train_indices, features])
        lst_val_preds[key] = model.predict(raw.loc[valid_indices, features])

    train_preds = np.column_stack(tuple(lst_tr_pred.values()))
    valid_preds = np.column_stack(tuple(lst_val_preds.values()))

    
    meta_model.fit(train_preds, raw.loc[train_indices, 'target'].clip(-0.002, 0.006))
    ypred = meta_model.predict(valid_preds)

    # Write the predicted growth back, then roll the density forward one month.
    raw.loc[(raw.dcount == TS), 'target'] = ypred
    raw.loc[(raw.dcount == TS+1), 'microbusiness_density'] = (1 + ypred) * np.array(raw.loc[(raw.dcount == TS), 'microbusiness_density'])
TS: 40
TS: 41
TS: 42
TS: 43
TS: 44
TS: 45
(the same LightGBM alias warnings as above repeat at each step)
raw[['row_id','istest','target','microbusiness_density','k']].head(10)
row_id istest target microbusiness_density k
0 1001_2019-08-01 0 0.010101 2.856021 1.0
1 1001_2019-09-01 0 0.059265 2.884870 1.0
2 1001_2019-10-01 0 -0.020489 3.055843 1.0
3 1001_2019-11-01 0 0.000000 2.993233 1.0
4 1001_2019-12-01 0 -0.008066 2.993233 1.0
5 1001_2020-01-01 0 -0.020129 2.969090 1.0
6 1001_2020-02-01 0 0.008217 2.909326 1.0
7 1001_2020-03-01 0 0.022820 2.933231 1.0
8 1001_2020-04-01 0 0.001594 3.000167 1.0
9 1001_2020-05-01 0 0.004773 3.004948 1.0

Create Submission

test = raw[raw.first_day_of_month >= '2022-11-01'].copy()
test = test[['row_id', 'microbusiness_density']]
test.to_csv('submission.csv', index=False)
test.tail(10)
row_id microbusiness_density
147296 56043_2023-05-01 3.042164
147297 56043_2023-06-01 3.036500
147337 56045_2022-11-01 1.785395
147338 56045_2022-12-01 1.803249
147339 56045_2023-01-01 1.807518
147340 56045_2023-02-01 1.803671
147341 56045_2023-03-01 1.798988
147342 56045_2023-04-01 1.795154
147343 56045_2023-05-01 1.791809
147344 56045_2023-06-01 1.788477
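
A final check worth running before submitting (a sketch; it assumes the submission must cover exactly the sample-submission row_ids):

# Hypothetical validation: row ids match the sample submission and nothing is missing.
assert set(test['row_id']) == set(sub['row_id'])
assert test['microbusiness_density'].notna().all()
print(test.shape, sub.shape)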