import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
BASE = '../input/godaddy-microbusiness-density-forecasting/'

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error; a (0, 0) pair scores 0, not NaN."""
    smap = np.zeros(len(y_true))

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Only score rows where at least one value is non-zero, avoiding 0/0.
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * np.mean(smap)
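
A quick spot check of the metric on toy values (illustrative only):

y_true = np.array([3.0, 2.0, 0.0, 1.0])
y_pred = np.array([3.0, 2.5, 0.0, 0.5])
print(smape(y_true, y_pred))  # ~22.2; the (0, 0) pair contributes 0, not NaN
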
census = pd.read_csv(BASE + 'census_starter.csv')
train = pd.read_csv(BASE + 'train.csv')
revealed_test = pd.read_csv(BASE + 'revealed_test.csv')
train = pd.concat([train, revealed_test]).sort_values(by=['cfips','first_day_of_month']).reset_index()
test = pd.read_csv(BASE + 'test.csv')
drop_index = (test.first_day_of_month == '2022-11-01') | (test.first_day_of_month == '2022-12-01')
test = test.loc[~drop_index,:]

sub = pd.read_csv(BASE + 'sample_submission.csv')
coords = pd.read_csv("/kaggle/input/usa-counties-coordinates/cfips_location.csv")
print(train.shape, test.shape, sub.shape)

train['istest'] = 0
test['istest'] = 1
raw = pd.concat((train, test)).sort_values(['cfips','row_id']).reset_index(drop=True)
raw = raw.merge(coords.drop("name", axis=1), on="cfips")

raw['state_i1'] = raw['state'].astype('category')
raw['county_i1'] = raw['county'].astype('category')
raw['first_day_of_month'] = pd.to_datetime(raw["first_day_of_month"])
raw['county'] = raw.groupby('cfips')['county'].ffill()
raw['state'] = raw.groupby('cfips')['state'].ffill()
raw["dcount"] = raw.groupby(['cfips'])['row_id'].cumcount()
raw['county_i'] = (raw['county'] + raw['state']).factorize()[0]
raw['state_i'] = raw['state'].factorize()[0]
raw['scale'] = (raw['first_day_of_month'] - raw['first_day_of_month'].min()).dt.days
raw['scale'] = raw['scale'].factorize()[0]
os.environ["CUDA_VISIBLE_DEVICES"]="0"
(128535, 8) (18810, 3) (25080, 2)
raw
index row_id cfips county state first_day_of_month microbusiness_density active istest lng lat state_i1 county_i1 dcount county_i state_i scale
0 0.0 1001_2019-08-01 1001 Autauga County Alabama 2019-08-01 3.007682 1249.0 0 -86.642900 32.535142 Alabama Autauga County 0 0 0 0
1 1.0 1001_2019-09-01 1001 Autauga County Alabama 2019-09-01 2.884870 1198.0 0 -86.642900 32.535142 Alabama Autauga County 1 0 0 1
2 2.0 1001_2019-10-01 1001 Autauga County Alabama 2019-10-01 3.055843 1269.0 0 -86.642900 32.535142 Alabama Autauga County 2 0 0 2
3 3.0 1001_2019-11-01 1001 Autauga County Alabama 2019-11-01 2.993233 1243.0 0 -86.642900 32.535142 Alabama Autauga County 3 0 0 3
4 4.0 1001_2019-12-01 1001 Autauga County Alabama 2019-12-01 2.993233 1243.0 0 -86.642900 32.535142 Alabama Autauga County 4 0 0 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
147340 NaN 56045_2023-02-01 56045 Weston County Wyoming 2023-02-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 42 3134 50 42
147341 NaN 56045_2023-03-01 56045 Weston County Wyoming 2023-03-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 43 3134 50 43
147342 NaN 56045_2023-04-01 56045 Weston County Wyoming 2023-04-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 44 3134 50 44
147343 NaN 56045_2023-05-01 56045 Weston County Wyoming 2023-05-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 45 3134 50 45
147344 NaN 56045_2023-06-01 56045 Weston County Wyoming 2023-06-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 46 3134 50 46

147345 rows × 17 columns

There are some anomalies in the series, especially at timestep 18. The loop below walks each county backwards and, whenever a month-over-month jump exceeds 10% of the running mean, shifts all earlier history to close the gap.
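
Before smoothing, a quick diagnostic sketch (not part of the original pipeline) shows where the large jumps cluster by timestep; per the note above, dcount 18 is expected to dominate:

tmp = raw[raw.istest == 0].copy()
tmp['chg'] = tmp.groupby('cfips')['microbusiness_density'].pct_change().abs()
print((tmp['chg'] > 0.10).groupby(tmp['dcount']).sum().sort_values(ascending=False).head())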

for o in tqdm(raw.cfips.unique()):
    indices = (raw['cfips'] == o)
    tmp = raw.loc[indices].copy().reset_index(drop=True)
    var = tmp.microbusiness_density.values.copy()
    # Walk backwards: when a month-over-month jump exceeds 10% of the
    # running mean, shift all earlier history up/down to close the gap.
    for i in range(37, 2, -1):
        thr = 0.10 * np.mean(var[:i])
        difa = var[i] - var[i - 1]
        if (difa >= thr) or (difa <= -thr):
            if difa > 0:
                var[:i] += difa - 0.003
            else:
                var[:i] += difa + 0.003
    # Nudge the first point just below the second to avoid a spurious initial spike.
    var[0] = var[1] * 0.99
    raw.loc[indices, 'microbusiness_density'] = var
{"model_id":"c5336d6305e14089a90c3de2a2101eda","version_major":2,"version_minor":0}
raw
index row_id cfips county state first_day_of_month microbusiness_density active istest lng lat state_i1 county_i1 dcount county_i state_i scale
0 0.0 1001_2019-08-01 1001 Autauga County Alabama 2019-08-01 2.856021 1249.0 0 -86.642900 32.535142 Alabama Autauga County 0 0 0 0
1 1.0 1001_2019-09-01 1001 Autauga County Alabama 2019-09-01 2.884870 1198.0 0 -86.642900 32.535142 Alabama Autauga County 1 0 0 1
2 2.0 1001_2019-10-01 1001 Autauga County Alabama 2019-10-01 3.055843 1269.0 0 -86.642900 32.535142 Alabama Autauga County 2 0 0 2
3 3.0 1001_2019-11-01 1001 Autauga County Alabama 2019-11-01 2.993233 1243.0 0 -86.642900 32.535142 Alabama Autauga County 3 0 0 3
4 4.0 1001_2019-12-01 1001 Autauga County Alabama 2019-12-01 2.993233 1243.0 0 -86.642900 32.535142 Alabama Autauga County 4 0 0 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
147340 NaN 56045_2023-02-01 56045 Weston County Wyoming 2023-02-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 42 3134 50 42
147341 NaN 56045_2023-03-01 56045 Weston County Wyoming 2023-03-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 43 3134 50 43
147342 NaN 56045_2023-04-01 56045 Weston County Wyoming 2023-04-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 44 3134 50 44
147343 NaN 56045_2023-05-01 56045 Weston County Wyoming 2023-05-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 45 3134 50 45
147344 NaN 56045_2023-06-01 56045 Weston County Wyoming 2023-06-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 46 3134 50 46

147345 rows × 17 columns

lag = 1
# Lagged density; bfill fills each county's first month (data is sorted by cfips,
# so the backfilled value comes from the same county's second month).
raw[f'mbd_lag_{lag}'] = raw.groupby('cfips')['microbusiness_density'].shift(lag).bfill()
# Absolute relative month-over-month change, with explicit handling of zero lags.
raw[f'dif_{lag}'] = (raw['microbusiness_density'] / raw[f'mbd_lag_{lag}']).fillna(1).clip(0, None) - 1
raw.loc[(raw[f'mbd_lag_{lag}']==0), f'dif_{lag}'] = 0
raw.loc[(raw[f'microbusiness_density']>0) & (raw[f'mbd_lag_{lag}']==0), f'dif_{lag}'] = 1
raw[f'dif_{lag}'] = raw[f'dif_{lag}'].abs()
# raw.groupby('dcount')['dif'].sum().plot()
raw.tail()
index row_id cfips county state first_day_of_month microbusiness_density active istest lng lat state_i1 county_i1 dcount county_i state_i scale mbd_lag_1 dif_1
147340 NaN 56045_2023-02-01 56045 Weston County Wyoming 2023-02-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 42 3134 50 42 NaN 0.0
147341 NaN 56045_2023-03-01 56045 Weston County Wyoming 2023-03-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 43 3134 50 43 NaN 0.0
147342 NaN 56045_2023-04-01 56045 Weston County Wyoming 2023-04-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 44 3134 50 44 NaN 0.0
147343 NaN 56045_2023-05-01 56045 Weston County Wyoming 2023-05-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 45 3134 50 45 NaN 0.0
147344 NaN 56045_2023-06-01 56045 Weston County Wyoming 2023-06-01 NaN NaN 1 -104.567404 43.840315 NaN NaN 46 3134 50 46 NaN 0.0
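
To make the zero handling concrete, a toy spot check (hypothetical values):

demo = pd.DataFrame({'mbd': [0.0, 0.5, 0.6], 'mbd_lag_1': [0.0, 0.0, 0.5]})
demo['dif_1'] = (demo['mbd'] / demo['mbd_lag_1']).fillna(1).clip(0, None) - 1
demo.loc[demo['mbd_lag_1'] == 0, 'dif_1'] = 0                        # 0 -> 0: no change
demo.loc[(demo['mbd'] > 0) & (demo['mbd_lag_1'] == 0), 'dif_1'] = 1  # 0 -> positive: capped at 1
print(demo['dif_1'].abs().tolist())  # [0.0, 1.0, ~0.2]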

SMAPE is a relative metric, so the level target is converted into a one-month growth rate: target_t = mbd_{t+1} / mbd_t - 1.
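
Concretely, the conversion and its inverse (used later when rolling forecasts forward), on made-up numbers:

cur, nxt = 3.00, 3.06     # density at months t and t+1 (illustrative values)
rate = nxt / cur - 1      # relative target the models learn: 0.02
print(cur * (1 + rate))   # the inverse step recovers 3.06 in the forecast loop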

raw['target'] = raw.groupby('cfips')['microbusiness_density'].shift(-1)
raw['target'] = raw['target'] / raw['microbusiness_density'] - 1

# Pin two outlier counties to zero growth.
raw.loc[raw['cfips']==28055, 'target'] = 0.0
raw.loc[raw['cfips']==48269, 'target'] = 0.0
raw['lastactive'] = raw.groupby('cfips')['active'].transform('last')

# dt = raw.loc[raw.dcount==40].groupby('cfips')['microbusiness_density'].agg('last')
# raw['lastactive'].clip(0, 8000).hist(bins=30)
raw
index row_id cfips county state first_day_of_month microbusiness_density active istest lng ... state_i1 county_i1 dcount county_i state_i scale mbd_lag_1 dif_1 target lastactive
0 0.0 1001_2019-08-01 1001 Autauga County Alabama 2019-08-01 2.856021 1249.0 0 -86.642900 ... Alabama Autauga County 0 0 0 0 2.856021 0.000000 0.010101 1475.0
1 1.0 1001_2019-09-01 1001 Autauga County Alabama 2019-09-01 2.884870 1198.0 0 -86.642900 ... Alabama Autauga County 1 0 0 1 2.856021 0.010101 0.059265 1475.0
2 2.0 1001_2019-10-01 1001 Autauga County Alabama 2019-10-01 3.055843 1269.0 0 -86.642900 ... Alabama Autauga County 2 0 0 2 2.884870 0.059265 -0.020489 1475.0
3 3.0 1001_2019-11-01 1001 Autauga County Alabama 2019-11-01 2.993233 1243.0 0 -86.642900 ... Alabama Autauga County 3 0 0 3 3.055843 0.020489 0.000000 1475.0
4 4.0 1001_2019-12-01 1001 Autauga County Alabama 2019-12-01 2.993233 1243.0 0 -86.642900 ... Alabama Autauga County 4 0 0 4 2.993233 0.000000 -0.008066 1475.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
147340 NaN 56045_2023-02-01 56045 Weston County Wyoming 2023-02-01 NaN NaN 1 -104.567404 ... NaN NaN 42 3134 50 42 NaN 0.000000 NaN 101.0
147341 NaN 56045_2023-03-01 56045 Weston County Wyoming 2023-03-01 NaN NaN 1 -104.567404 ... NaN NaN 43 3134 50 43 NaN 0.000000 NaN 101.0
147342 NaN 56045_2023-04-01 56045 Weston County Wyoming 2023-04-01 NaN NaN 1 -104.567404 ... NaN NaN 44 3134 50 44 NaN 0.000000 NaN 101.0
147343 NaN 56045_2023-05-01 56045 Weston County Wyoming 2023-05-01 NaN NaN 1 -104.567404 ... NaN NaN 45 3134 50 45 NaN 0.000000 NaN 101.0
147344 NaN 56045_2023-06-01 56045 Weston County Wyoming 2023-06-01 NaN NaN 1 -104.567404 ... NaN NaN 46 3134 50 46 NaN 0.000000 NaN 101.0

147345 rows × 21 columns

Build Features

def build_features(raw, target='microbusiness_density', target_act='active_tmp', lags=6):
    feats = []

    # Lagged target values and lagged diffs of the active count, per county.
    for lag in range(1, lags):
        raw[f'mbd_lag_{lag}'] = raw.groupby('cfips')[target].shift(lag)
        raw[f'act_lag_{lag}'] = raw.groupby('cfips')[target_act].diff(lag)
        feats.append(f'mbd_lag_{lag}')
        feats.append(f'act_lag_{lag}')
        
    lag = 1
    for window in [2, 4, 6, 8, 10]:
        # Despite the "rollmea" name, this is a rolling *sum* of the lagged target.
        raw[f'mbd_rollmea{window}_{lag}'] = raw.groupby('cfips')[f'mbd_lag_{lag}'].transform(lambda s: s.rolling(window, min_periods=1).sum())
        feats.append(f'mbd_rollmea{window}_{lag}')

    census_columns = list(census.columns)
    census_columns.remove("cfips")
    
    raw = raw.merge(census, on="cfips", how="left")
    feats += census_columns
    
    co_est = pd.read_csv("/kaggle/input/us-indicator/co-est2021-alldata.csv", encoding='latin-1')
    co_est["cfips"] = co_est.STATE*1000 + co_est.COUNTY
    co_columns = [
        'SUMLEV',
        'DIVISION',
        'ESTIMATESBASE2020',
        'POPESTIMATE2020',
        'POPESTIMATE2021',
        'NPOPCHG2020',
        'NPOPCHG2021',
        'BIRTHS2020',
        'BIRTHS2021',
        'DEATHS2020',
        'DEATHS2021',
        'NATURALCHG2020',
        'NATURALCHG2021',
        'INTERNATIONALMIG2020',
        'INTERNATIONALMIG2021',
        'DOMESTICMIG2020',
        'DOMESTICMIG2021',
        'NETMIG2020',
        'NETMIG2021',
        'RESIDUAL2020',
        'RESIDUAL2021',
        'GQESTIMATESBASE2020',
        'GQESTIMATES2020',
        'GQESTIMATES2021',
        'RBIRTH2021',
        'RDEATH2021',
        'RNATURALCHG2021',
        'RINTERNATIONALMIG2021',
        'RDOMESTICMIG2021',
        'RNETMIG2021'
    ]
    raw = raw.merge(co_est, on="cfips", how="left")
    feats +=  co_columns
    return raw, feats
# Build features based on lags of the target
raw, feats = build_features(raw, 'target', 'active', lags=9)
features = ['state_i']
features += feats
features += ['lng','lat','scale']
# print(features)
# raw.loc[raw.dcount==40, features].head(10)
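
A minimal sanity-check sketch (not in the original notebook) before training on this feature matrix:

# Count features and surface the columns with the most NaNs on the training slice.
print(len(features), 'features')
print(raw.loc[raw.istest == 0, features].isna().mean().sort_values(ascending=False).head())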

Latitude and longitude feature engineering, adapted from samu2505.

coordinates = raw[['lng', 'lat']].values

# Sinusoidal encoding trick: scale each coordinate by emb_size geometric
# frequencies (spanning up to `precision`), then take cos/sin of the angles.
emb_size = 20
precision = 1e6

latlon = np.expand_dims(coordinates, axis=-1)

m = np.exp(np.log(precision) / emb_size)
angle_freq = m ** np.arange(emb_size)
angle_freq = angle_freq.reshape(1, 1, emb_size)
latlon = latlon * angle_freq
latlon[..., 0::2] = np.cos(latlon[..., 0::2])
latlon[..., 1::2] = np.sin(latlon[..., 1::2])  # the sin half was missing; completes the pair
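
The encoded array is never joined back to `raw` in this notebook; only the rotation features below enter the feature list. If one wanted to use it, a minimal sketch (hypothetical `latlon_enc_*` column names):

# Hypothetical: flatten the (n_rows, 2, emb_size) encoding into named columns.
enc = latlon.reshape(len(raw), -1)
enc_df = pd.DataFrame(enc, index=raw.index,
                      columns=[f'latlon_enc_{i}' for i in range(enc.shape[1])])
print(enc_df.shape)  # (len(raw), 40)
# raw = pd.concat([raw, enc_df], axis=1); features += list(enc_df.columns)
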
def rot(df):
    # Rotated coordinate projections let axis-aligned tree splits act obliquely.
    for angle in [15, 30, 45]:
        df[f'rot_{angle}_x'] = (np.cos(np.radians(angle)) * df['lat']) + \
                               (np.sin(np.radians(angle)) * df['lng'])

        df[f'rot_{angle}_y'] = (np.cos(np.radians(angle)) * df['lat']) - \
                               (np.sin(np.radians(angle)) * df['lng'])

    return df

raw = rot(raw)
features += ['rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y', 'rot_45_x', 'rot_45_y']

Model Building

def get_model():
    from sklearn.ensemble import VotingRegressor
    import lightgbm as lgb
    import xgboost as xgb
    import catboost as cat
    from sklearn.pipeline import Pipeline
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.impute import KNNImputer    

    # Note: the CatBoost iteration count could likely be reduced.
    cat_model = cat.CatBoostRegressor(
        iterations=2000,
        loss_function="MAPE",
        verbose=0,
        grow_policy='SymmetricTree',
        learning_rate=0.035,
        colsample_bylevel=0.8,
        max_depth=5,
        l2_leaf_reg=0.2,
        subsample=0.70,
        max_bin=4096,
    )

    return cat_model


def base_models():
    from sklearn.ensemble import VotingRegressor
    import lightgbm as lgb
    import xgboost as xgb
    import catboost as cat
    from sklearn.pipeline import Pipeline
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.impute import KNNImputer    
    
    # LGBM model
    params = {
        'n_iter': 300,
        'boosting_type': 'dart',
        'verbosity': -1,
        'objective': 'l1',
        'random_state': 42,
        'colsample_bytree': 0.8841279649367693,
        'colsample_bynode': 0.10142964450634374,
        'max_depth': 8,
        'learning_rate': 0.003647749926797374,
        'lambda_l2': 0.5,
        'num_leaves': 61,
        'seed': 42,
        'min_data_in_leaf': 213,
    }

    lgb_model = lgb.LGBMRegressor(**params)
    
    xgb_model = xgb.XGBRegressor(
        objective='reg:pseudohubererror',
        tree_method='hist',
        n_estimators=795,
        learning_rate=0.0075,
        max_leaves=17,
        subsample=0.50,
        colsample_bytree=0.50,
        max_bin=4096,
        n_jobs=2,
    )

    # Note: the CatBoost iteration count could likely be reduced.
    cat_model = cat.CatBoostRegressor(
        iterations=2000,
        loss_function="MAPE",
        verbose=0,
        grow_policy='SymmetricTree',
        learning_rate=0.035,
        colsample_bylevel=0.8,
        max_depth=5,
        l2_leaf_reg=0.2,
        subsample=0.70,
        max_bin=4096,
    )
    
    models = {}
    models['xgb'] = xgb_model
    models['lgbm'] = lgb_model
    models['cat'] = cat_model

    return models
ACT_THR = 150    # counties at or below this many active businesses fall back to the last value
MONTH_1 = 20
MONTH_last = 40

raw['k'] = 1.
raw['microbusiness_density'] = raw['microbusiness_density'].fillna(0)


TS = 39
print(f'TS: {TS}')

train_indices = (raw.istest==0) & (raw.dcount  < TS) & (raw.dcount >= 1) & (raw.lastactive>ACT_THR) 
valid_indices = (raw.istest==0) & (raw.dcount == TS)

# model = get_model()

models = base_models()

# Train each of the models on the current TS


lst_tr_pred = {}
lst_val_preds = {}
for key, model in models.items():
    model.fit(
        raw.loc[train_indices, features],
        raw.loc[train_indices, 'target'].clip(-0.002, 0.006))
    lst_tr_pred[key] = model.predict(raw.loc[train_indices, features])
    lst_val_preds[key] = model.predict(raw.loc[valid_indices, features])

train_preds = np.column_stack(tuple(lst_tr_pred.values()))
valid_preds = np.column_stack(tuple(lst_val_preds.values()))

meta_model = get_model() 
meta_model.fit(train_preds, raw.loc[train_indices, 'target'].clip(-0.002, 0.006))
ypred = meta_model.predict(valid_preds)

#raw.loc[valid_indices, 'target'] = ypred
raw.loc[valid_indices, 'k'] = ypred + 1
raw.loc[valid_indices,'k'] = raw.loc[valid_indices,'k'] * raw.loc[valid_indices,'microbusiness_density']

# Validate
lastval = raw.loc[raw.dcount==TS, ['cfips', 'microbusiness_density']].set_index('cfips').to_dict()['microbusiness_density']
dt = raw.loc[raw.dcount==TS, ['cfips', 'k']].set_index('cfips').to_dict()['k']

df = raw.loc[raw.dcount==(TS+1), 
             ['cfips', 'microbusiness_density', 'state', 'lastactive', 'mbd_lag_1']].reset_index(drop=True)
df['pred'] = df['cfips'].map(dt)
df['lastval'] = df['cfips'].map(lastval)

#     df.loc[df['lastval'].isnull(), 'lastval'] = df.loc[df['lastval'].isnull(), 'microbusiness_density']    

df.loc[df['lastactive']<=ACT_THR, 'pred'] = df.loc[df['lastactive']<=ACT_THR, 'lastval']

print('Last Value SMAPE:', smape(df['microbusiness_density'], df['lastval']) )
print('SMAPE:', smape(df['microbusiness_density'], df['pred']))
print()

ind = (raw.dcount > MONTH_1) & (raw.dcount <= MONTH_last)  # defined here but not used below
TS: 39
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:177: UserWarning: Found `n_iter` in params. Will use it instead of argument
  _log_warning(f"Found `{alias}` in params. Will use it instead of argument")
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
Last Value SMAPE: 1.889206717018118
SMAPE: 1.8637258032256854

# Recursive forecasting: predict one month ahead, write it back, roll forward.
# Note: the lag features built earlier are not rebuilt inside this loop.
for i in range(6):
    TS = TS + 1
    print(f'TS: {TS}')
    train_indices = (raw.istest==0) & (raw.dcount < TS) & (raw.dcount >= 1) & (raw.lastactive>ACT_THR)
    valid_indices = (raw.dcount == TS)  # includes test rows: these months are being forecast
    
    lst_tr_pred = {}
    lst_val_preds = {}
    for key, model in models.items():
        model.fit(
            raw.loc[train_indices, features],
            raw.loc[train_indices, 'target'].clip(-0.002, 0.006))
        lst_tr_pred[key] = model.predict(raw.loc[train_indices, features])
        lst_val_preds[key] = model.predict(raw.loc[valid_indices, features])

    train_preds = np.column_stack(tuple(lst_tr_pred.values()))
    valid_preds = np.column_stack(tuple(lst_val_preds.values()))

    
    meta_model.fit(train_preds, raw.loc[train_indices, 'target'].clip(-0.002, 0.006))
    ypred = meta_model.predict(valid_preds)

    # Write the predicted growth back, then roll the density forward one month.
    raw.loc[(raw.dcount == TS), 'target'] = ypred
    raw.loc[(raw.dcount == TS+1), 'microbusiness_density'] = (1 + ypred) * np.array(raw.loc[(raw.dcount == TS), 'microbusiness_density'])
TS: 40
TS: 41
TS: 42
TS: 43
TS: 44
TS: 45
(the same LightGBM alias warnings as above repeat at each step)
raw[['row_id','istest','target','microbusiness_density','k']].head(10)
row_id istest target microbusiness_density k
0 1001_2019-08-01 0 0.010101 2.856021 1.0
1 1001_2019-09-01 0 0.059265 2.884870 1.0
2 1001_2019-10-01 0 -0.020489 3.055843 1.0
3 1001_2019-11-01 0 0.000000 2.993233 1.0
4 1001_2019-12-01 0 -0.008066 2.993233 1.0
5 1001_2020-01-01 0 -0.020129 2.969090 1.0
6 1001_2020-02-01 0 0.008217 2.909326 1.0
7 1001_2020-03-01 0 0.022820 2.933231 1.0
8 1001_2020-04-01 0 0.001594 3.000167 1.0
9 1001_2020-05-01 0 0.004773 3.004948 1.0

Create Submission

test = raw[raw.first_day_of_month >= '2022-11-01'].copy()
test = test[['row_id', 'microbusiness_density']]
test.to_csv('submission.csv', index=False)
test.tail(10)
row_id microbusiness_density
147296 56043_2023-05-01 3.042164
147297 56043_2023-06-01 3.036500
147337 56045_2022-11-01 1.785395
147338 56045_2022-12-01 1.803249
147339 56045_2023-01-01 1.807518
147340 56045_2023-02-01 1.803671
147341 56045_2023-03-01 1.798988
147342 56045_2023-04-01 1.795154
147343 56045_2023-05-01 1.791809
147344 56045_2023-06-01 1.788477
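
A final check worth running before submitting (a sketch; it assumes the submission must cover exactly the sample-submission row_ids):

# Hypothetical validation: row ids match the sample submission and nothing is missing.
assert set(test['row_id']) == set(sub['row_id'])
assert test['microbusiness_density'].notna().all()
print(test.shape, sub.shape)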