import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
BASE = '../input/godaddy-microbusiness-density-forecasting/'
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each pair contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``.
    Pairs where both values are exactly zero contribute 0, which avoids a
    0/0 division. The two inputs are compared position by position.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        ``100 * mean`` of the per-pair errors.
    """
    # Coerce to arrays so plain lists and pandas Series behave the same way.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    smap = np.zeros(len(y_true))
    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Only divide where at least one side is non-zero; the rest stay at 0.
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]
    return 100 * np.mean(smap)


# County-level census table; merged into the feature frame by build_features.
census = pd.read_csv(BASE + 'census_starter.csv')
# Load the competition tables.
train = pd.read_csv(BASE + 'train.csv')
reaveal_test = pd.read_csv(BASE + 'revealed_test.csv')
# Append the revealed months to the training history, ordered by county and month.
# NOTE(review): reset_index() without drop=True keeps the pre-sort index as an
# extra 'index' column (visible in the `raw` previews). It is not part of the
# feature list assembled later, so it is left as-is here.
train = pd.concat([train, reaveal_test]).sort_values(by=['cfips','first_day_of_month']).reset_index()
test = pd.read_csv(BASE + 'test.csv')
# Remove the 2022-11 and 2022-12 rows from test -- presumably because
# revealed_test.csv already supplies those months (confirm against the data).
drop_index = (test.first_day_of_month == '2022-11-01') | (test.first_day_of_month == '2022-12-01')
# .copy() makes `test` an independent frame, so adding the 'istest' column
# below does not write into a slice of the original (SettingWithCopyWarning).
test = test.loc[~drop_index, :].copy()
sub = pd.read_csv(BASE + 'sample_submission.csv')
coords = pd.read_csv("/kaggle/input/usa-counties-coordinates/cfips_location.csv")
print(train.shape, test.shape, sub.shape)

# Stack train and test into one frame, flagged by 'istest'.
train['istest'] = 0
test['istest'] = 1
raw = pd.concat((train, test)).sort_values(['cfips','row_id']).reset_index(drop=True)
# Default (inner) merge: a county without a coordinates row would be dropped.
raw = raw.merge(coords.drop("name", axis=1), on="cfips")
# Categorical copies are taken BEFORE the forward-fill below, so rows that
# arrive without state/county (the test rows in the previews) stay NaN here.
raw['state_i1'] = raw['state'].astype('category')
raw['county_i1'] = raw['county'].astype('category')
raw['first_day_of_month'] = pd.to_datetime(raw["first_day_of_month"])
# Propagate county/state names down each county's rows.
raw['county'] = raw.groupby('cfips')['county'].ffill()
raw['state'] = raw.groupby('cfips')['state'].ffill()
# Running row number within each county (0 for the first month).
raw["dcount"] = raw.groupby(['cfips'])['row_id'].cumcount()
# Integer codes for county (name + state) and for state.
raw['county_i'] = (raw['county'] + raw['state']).factorize()[0]
raw['state_i'] = raw['state'].factorize()[0]
# Days since the earliest month, then replaced by an integer code per distinct value.
raw['scale'] = (raw['first_day_of_month'] - raw['first_day_of_month'].min()).dt.days
raw['scale'] = raw['scale'].factorize()[0]
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Recorded notebook output of the print(...) call above:
# (128535, 8) (18810, 3) (25080, 2)
raw| index | row_id | cfips | county | state | first_day_of_month | microbusiness_density | active | istest | lng | lat | state_i1 | county_i1 | dcount | county_i | state_i | scale | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1001_2019-08-01 | 1001 | Autauga County | Alabama | 2019-08-01 | 3.007682 | 1249.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 0 | 0 | 0 | 0 |
| 1 | 1.0 | 1001_2019-09-01 | 1001 | Autauga County | Alabama | 2019-09-01 | 2.884870 | 1198.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 1 | 0 | 0 | 1 |
| 2 | 2.0 | 1001_2019-10-01 | 1001 | Autauga County | Alabama | 2019-10-01 | 3.055843 | 1269.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 2 | 0 | 0 | 2 |
| 3 | 3.0 | 1001_2019-11-01 | 1001 | Autauga County | Alabama | 2019-11-01 | 2.993233 | 1243.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 3 | 0 | 0 | 3 |
| 4 | 4.0 | 1001_2019-12-01 | 1001 | Autauga County | Alabama | 2019-12-01 | 2.993233 | 1243.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 4 | 0 | 0 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 147340 | NaN | 56045_2023-02-01 | 56045 | Weston County | Wyoming | 2023-02-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 42 | 3134 | 50 | 42 |
| 147341 | NaN | 56045_2023-03-01 | 56045 | Weston County | Wyoming | 2023-03-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 43 | 3134 | 50 | 43 |
| 147342 | NaN | 56045_2023-04-01 | 56045 | Weston County | Wyoming | 2023-04-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 44 | 3134 | 50 | 44 |
| 147343 | NaN | 56045_2023-05-01 | 56045 | Weston County | Wyoming | 2023-05-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 45 | 3134 | 50 | 45 |
| 147344 | NaN | 56045_2023-06-01 | 56045 | Weston County | Wyoming | 2023-06-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 46 | 3134 | 50 | 46 |
147345 rows × 17 columns
# Level-shift correction, one county at a time: walking backwards from month
# index 37 down to 3, whenever the month-over-month change exceeds 10% of the
# mean of the earlier months, the whole earlier history is shifted by that
# change (minus a 0.003 margin) so the jump disappears.
for o in tqdm(raw.cfips.unique()):
    indices = (raw['cfips'] == o)
    # Work on a private copy of this county's series.
    var = raw.loc[indices, 'microbusiness_density'].values.copy()
    for i in range(37, 2, -1):
        thr = 0.10 * np.mean(var[:i])
        difa = var[i] - var[i - 1]
        if (difa >= thr) or (difa <= -thr):
            if difa > 0:
                var[:i] += difa - 0.003
            else:
                var[:i] += difa + 0.003
    # The first observation is replaced by 99% of the second one.
    var[0] = var[1] * 0.99
    raw.loc[indices, 'microbusiness_density'] = var
# Recorded notebook output (tqdm progress widget):
# {"model_id":"c5336d6305e14089a90c3de2a2101eda","version_major":2,"version_minor":0}
raw
# Recorded notebook output (header of the table that follows):
# | index | row_id | cfips | county | state | first_day_of_month | microbusiness_density | active | istest | lng | lat | state_i1 | county_i1 | dcount | county_i | state_i | scale | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1001_2019-08-01 | 1001 | Autauga County | Alabama | 2019-08-01 | 2.856021 | 1249.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 0 | 0 | 0 | 0 |
| 1 | 1.0 | 1001_2019-09-01 | 1001 | Autauga County | Alabama | 2019-09-01 | 2.884870 | 1198.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 1 | 0 | 0 | 1 |
| 2 | 2.0 | 1001_2019-10-01 | 1001 | Autauga County | Alabama | 2019-10-01 | 3.055843 | 1269.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 2 | 0 | 0 | 2 |
| 3 | 3.0 | 1001_2019-11-01 | 1001 | Autauga County | Alabama | 2019-11-01 | 2.993233 | 1243.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 3 | 0 | 0 | 3 |
| 4 | 4.0 | 1001_2019-12-01 | 1001 | Autauga County | Alabama | 2019-12-01 | 2.993233 | 1243.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 4 | 0 | 0 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 147340 | NaN | 56045_2023-02-01 | 56045 | Weston County | Wyoming | 2023-02-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 42 | 3134 | 50 | 42 |
| 147341 | NaN | 56045_2023-03-01 | 56045 | Weston County | Wyoming | 2023-03-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 43 | 3134 | 50 | 43 |
| 147342 | NaN | 56045_2023-04-01 | 56045 | Weston County | Wyoming | 2023-04-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 44 | 3134 | 50 | 44 |
| 147343 | NaN | 56045_2023-05-01 | 56045 | Weston County | Wyoming | 2023-05-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 45 | 3134 | 50 | 45 |
| 147344 | NaN | 56045_2023-06-01 | 56045 | Weston County | Wyoming | 2023-06-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 46 | 3134 | 50 | 46 |
147345 rows × 17 columns
# One-month lag of the density and the absolute relative change versus it.
lag = 1
lag_col = f'mbd_lag_{lag}'
dif_col = f'dif_{lag}'

# Previous month's density within each county.
# NOTE(review): bfill() runs on the whole column, not per county, so a gap at
# the end of one county's rows can be filled from the next county.
shifted_density = raw.groupby('cfips')['microbusiness_density'].shift(lag)
raw[lag_col] = shifted_density.bfill()

# Ratio to the lagged value, minus 1; missing ratios count as "no change".
density_ratio = raw['microbusiness_density'] / raw[lag_col]
raw[dif_col] = density_ratio.fillna(1).clip(0, None) - 1

# A zero lag makes the ratio meaningless: use 0, or 1 when the density became positive.
lag_is_zero = (raw[lag_col] == 0)
raw.loc[lag_is_zero, dif_col] = 0
raw.loc[(raw['microbusiness_density'] > 0) & lag_is_zero, dif_col] = 1
raw[dif_col] = raw[dif_col].abs()
# raw.groupby('dcount')['dif'].sum().plot()raw.tail()| index | row_id | cfips | county | state | first_day_of_month | microbusiness_density | active | istest | lng | lat | state_i1 | county_i1 | dcount | county_i | state_i | scale | mbd_lag_1 | dif_1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 147340 | NaN | 56045_2023-02-01 | 56045 | Weston County | Wyoming | 2023-02-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 42 | 3134 | 50 | 42 | NaN | 0.0 |
| 147341 | NaN | 56045_2023-03-01 | 56045 | Weston County | Wyoming | 2023-03-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 43 | 3134 | 50 | 43 | NaN | 0.0 |
| 147342 | NaN | 56045_2023-04-01 | 56045 | Weston County | Wyoming | 2023-04-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 44 | 3134 | 50 | 44 | NaN | 0.0 |
| 147343 | NaN | 56045_2023-05-01 | 56045 | Weston County | Wyoming | 2023-05-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 45 | 3134 | 50 | 45 | NaN | 0.0 |
| 147344 | NaN | 56045_2023-06-01 | 56045 | Weston County | Wyoming | 2023-06-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 46 | 3134 | 50 | 46 | NaN | 0.0 |
# Target: relative change of the density from this month to the next one,
# computed within each county.
raw['target'] = raw.groupby('cfips')['microbusiness_density'].shift(-1)
raw['target'] = raw['target']/raw['microbusiness_density'] - 1
# Two counties get a constant zero target.
# NOTE(review): the reason is not visible here -- presumably their ratio is
# undefined or unstable; confirm before changing.
raw.loc[raw['cfips']==28055, 'target'] = 0.0
raw.loc[raw['cfips']==48269, 'target'] = 0.0
# Last available 'active' value of each county, broadcast to all of its rows.
raw['lastactive'] = raw.groupby('cfips')['active'].transform('last')
# dt = raw.loc[raw.dcount==40].groupby('cfips')['microbusiness_density'].agg('last')
# raw['lastactive'].clip(0, 8000).hist(bins=30)raw| index | row_id | cfips | county | state | first_day_of_month | microbusiness_density | active | istest | lng | ... | state_i1 | county_i1 | dcount | county_i | state_i | scale | mbd_lag_1 | dif_1 | target | lastactive | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1001_2019-08-01 | 1001 | Autauga County | Alabama | 2019-08-01 | 2.856021 | 1249.0 | 0 | -86.642900 | ... | Alabama | Autauga County | 0 | 0 | 0 | 0 | 2.856021 | 0.000000 | 0.010101 | 1475.0 |
| 1 | 1.0 | 1001_2019-09-01 | 1001 | Autauga County | Alabama | 2019-09-01 | 2.884870 | 1198.0 | 0 | -86.642900 | ... | Alabama | Autauga County | 1 | 0 | 0 | 1 | 2.856021 | 0.010101 | 0.059265 | 1475.0 |
| 2 | 2.0 | 1001_2019-10-01 | 1001 | Autauga County | Alabama | 2019-10-01 | 3.055843 | 1269.0 | 0 | -86.642900 | ... | Alabama | Autauga County | 2 | 0 | 0 | 2 | 2.884870 | 0.059265 | -0.020489 | 1475.0 |
| 3 | 3.0 | 1001_2019-11-01 | 1001 | Autauga County | Alabama | 2019-11-01 | 2.993233 | 1243.0 | 0 | -86.642900 | ... | Alabama | Autauga County | 3 | 0 | 0 | 3 | 3.055843 | 0.020489 | 0.000000 | 1475.0 |
| 4 | 4.0 | 1001_2019-12-01 | 1001 | Autauga County | Alabama | 2019-12-01 | 2.993233 | 1243.0 | 0 | -86.642900 | ... | Alabama | Autauga County | 4 | 0 | 0 | 4 | 2.993233 | 0.000000 | -0.008066 | 1475.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 147340 | NaN | 56045_2023-02-01 | 56045 | Weston County | Wyoming | 2023-02-01 | NaN | NaN | 1 | -104.567404 | ... | NaN | NaN | 42 | 3134 | 50 | 42 | NaN | 0.000000 | NaN | 101.0 |
| 147341 | NaN | 56045_2023-03-01 | 56045 | Weston County | Wyoming | 2023-03-01 | NaN | NaN | 1 | -104.567404 | ... | NaN | NaN | 43 | 3134 | 50 | 43 | NaN | 0.000000 | NaN | 101.0 |
| 147342 | NaN | 56045_2023-04-01 | 56045 | Weston County | Wyoming | 2023-04-01 | NaN | NaN | 1 | -104.567404 | ... | NaN | NaN | 44 | 3134 | 50 | 44 | NaN | 0.000000 | NaN | 101.0 |
| 147343 | NaN | 56045_2023-05-01 | 56045 | Weston County | Wyoming | 2023-05-01 | NaN | NaN | 1 | -104.567404 | ... | NaN | NaN | 45 | 3134 | 50 | 45 | NaN | 0.000000 | NaN | 101.0 |
| 147344 | NaN | 56045_2023-06-01 | 56045 | Weston County | Wyoming | 2023-06-01 | NaN | NaN | 1 | -104.567404 | ... | NaN | NaN | 46 | 3134 | 50 | 46 | NaN | 0.000000 | NaN | 101.0 |
147345 rows × 21 columns
def build_features(raw, target='microbusiness_density', target_act='active_tmp', lags = 6):
    """Add lag, rolling and external county features and list their names.

    The lag and rolling columns are written into ``raw`` in place; the two
    merges then produce a new frame, which is what gets returned. The function
    reads the module-level ``census`` table and loads the county population
    estimates CSV from disk.

    Args:
        raw: Frame with a 'cfips' column plus the ``target`` and
            ``target_act`` columns.
        target: Column whose per-county lags become ``mbd_lag_<k>``.
        target_act: Column whose per-county k-row differences become
            ``act_lag_<k>``.
        lags: Exclusive upper bound of the lag range, i.e. lags 1..lags-1.

    Returns:
        Tuple ``(merged_frame, feature_names)``.
    """
    feats = []

    # Per-county lags of the target and k-step differences of the activity column.
    for k in range(1, lags):
        mbd_name = f'mbd_lag_{k}'
        act_name = f'act_lag_{k}'
        raw[mbd_name] = raw.groupby('cfips')[target].shift(k)
        raw[act_name] = raw.groupby('cfips')[target_act].diff(k)
        feats += [mbd_name, act_name]

    # Rolling windows over the lag-1 column, per county.
    # Despite the 'rollmea' name, the aggregate is a rolling SUM.
    lag = 1
    lagged_col = f'mbd_lag_{lag}'
    for window in (2, 4, 6, 8, 10):
        roll_name = f'mbd_rollmea{window}_{lag}'
        raw[roll_name] = raw.groupby('cfips')[lagged_col].transform(
            lambda s: s.rolling(window, min_periods=1).sum()
        )
        feats.append(roll_name)

    # Census columns: everything in the module-level table except the key.
    census_columns = list(census.columns)
    census_columns.remove("cfips")
    raw = raw.merge(census, on="cfips", how="left")
    feats += census_columns

    # County population estimates; the key is rebuilt from state and county codes.
    co_est = pd.read_csv("/kaggle/input/us-indicator/co-est2021-alldata.csv", encoding='latin-1')
    co_est["cfips"] = co_est.STATE*1000 + co_est.COUNTY
    co_columns = [
        'SUMLEV',
        'DIVISION',
        'ESTIMATESBASE2020',
        'POPESTIMATE2020',
        'POPESTIMATE2021',
        'NPOPCHG2020',
        'NPOPCHG2021',
        'BIRTHS2020',
        'BIRTHS2021',
        'DEATHS2020',
        'DEATHS2021',
        'NATURALCHG2020',
        'NATURALCHG2021',
        'INTERNATIONALMIG2020',
        'INTERNATIONALMIG2021',
        'DOMESTICMIG2020',
        'DOMESTICMIG2021',
        'NETMIG2020',
        'NETMIG2021',
        'RESIDUAL2020',
        'RESIDUAL2021',
        'GQESTIMATESBASE2020',
        'GQESTIMATES2020',
        'GQESTIMATES2021',
        'RBIRTH2021',
        'RDEATH2021',
        'RNATURALCHG2021',
        'RINTERNATIONALMIG2021',
        'RDOMESTICMIG2021',
        'RNETMIG2021',
    ]
    # All co_est columns are merged in; only co_columns are listed as features.
    raw = raw.merge(co_est, on="cfips", how="left")
    feats += co_columns
    return raw, feats


# Build features based on lags of the target
# Lags 1..8 of the relative-change target, plus differences of 'active'.
raw, feats = build_features(raw, 'target', 'active', lags = 9)
# Final model inputs: state code, engineered features, then location and time index.
features = ['state_i', *feats, 'lng', 'lat', 'scale']
# print(features)
# raw.loc[raw.dcount==40, features].head(10)
# Latitude and longitude feature engineering from samu2505.
coordinates = raw[['lng', 'lat']].values
# Encoding tricks: scale each coordinate by emb_size geometrically spaced
# frequencies (from 1 up to just under `precision`).
emb_size = 20
precision = 1e6
latlon = np.expand_dims(coordinates, axis=-1)
m = np.exp(np.log(precision)/emb_size)
angle_freq = m ** np.arange(emb_size)
angle_freq = angle_freq.reshape(1,1, emb_size)
latlon = latlon * angle_freq
# Only the even frequency slots are replaced by their cosine.
# NOTE(review): the odd slots keep the raw scaled values and `latlon` is not
# referenced again in the visible code -- this cell may be truncated.
latlon[..., 0::2] = np.cos(latlon[..., 0::2])


def rot(df):
    """Add rotated-coordinate columns for 15, 30 and 45 degrees.

    For each angle, ``rot_<angle>_x`` is ``cos(a)*lat + sin(a)*lng`` and
    ``rot_<angle>_y`` is ``cos(a)*lat - sin(a)*lng``.

    Args:
        df: Frame with numeric 'lat' and 'lng' columns; modified in place.

    Returns:
        The same ``df`` object, with six new columns.
    """
    for angle in [15, 30, 45]:
        cos_a = np.cos(np.radians(angle))
        sin_a = np.sin(np.radians(angle))
        df[f'rot_{angle}_x'] = (cos_a * df['lat']) + (sin_a * df['lng'])
        df[f'rot_{angle}_y'] = (cos_a * df['lat']) - (sin_a * df['lng'])
    return df
# Add the rotated coordinates to the frame and register them as model inputs.
raw = rot(raw)
features += ['rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y', 'rot_45_x', 'rot_45_y']


def get_model():
    """Return a new, unfitted CatBoost regressor with a MAPE loss.

    The import is local so the rest of the file loads without catboost.
    """
    import catboost as cat

    # we should decrease the num_iterations of catboost
    cat_model = cat.CatBoostRegressor(
        iterations=2000,
        loss_function="MAPE",
        verbose=0,
        grow_policy='SymmetricTree',
        learning_rate=0.035,
        colsample_bylevel=0.8,
        max_depth=5,
        l2_leaf_reg=0.2,
        subsample=0.70,
        max_bin=4096,
    )
    return cat_model
def base_models():
    """Return the three unfitted base regressors, keyed by short name.

    Returns:
        Dict with keys 'xgb', 'lgbm' and 'cat', in that insertion order.
        Callers that stack predictions column-wise rely on this order.
    """
    import lightgbm as lgb
    import xgboost as xgb
    import catboost as cat

    # LGBM model. Canonical scikit-learn-API names are used in place of the
    # aliases (n_iter, lambda_l2, min_data_in_leaf, seed): the values are the
    # same, but LightGBM no longer warns about conflicting aliases.
    params = {
        'n_estimators': 300,
        'boosting_type': 'dart',
        'verbosity': -1,
        'objective': 'l1',
        'random_state': 42,
        'colsample_bytree': 0.8841279649367693,
        'colsample_bynode': 0.10142964450634374,
        'max_depth': 8,
        'learning_rate': 0.003647749926797374,
        'reg_lambda': 0.5,
        'num_leaves': 61,
        'min_child_samples': 213,
    }
    lgb_model = lgb.LGBMRegressor(**params)

    xgb_model = xgb.XGBRegressor(
        objective='reg:pseudohubererror',
        tree_method="hist",
        n_estimators=795,
        learning_rate=0.0075,
        max_leaves = 17,
        subsample=0.50,
        colsample_bytree=0.50,
        max_bin=4096,
        n_jobs=2,
    )

    # we should decrease the num_iterations of catboost
    cat_model = cat.CatBoostRegressor(
        iterations=2000,
        loss_function="MAPE",
        verbose=0,
        grow_policy='SymmetricTree',
        learning_rate=0.035,
        colsample_bylevel=0.8,
        max_depth=5,
        l2_leaf_reg=0.2,
        subsample=0.70,
        max_bin=4096,
    )

    models = {}
    models['xgb'] = xgb_model
    models['lgbm'] = lgb_model
    models['cat'] = cat_model
    return models


# Counties whose last 'active' count is at or below this threshold are left
# out of training; validation falls back to the last observed value for them.
ACT_THR = 150
MONTH_1 = 20
MONTH_last = 40
raw['k'] = 1.
# Assign the filled column back: calling fillna(inplace=True) on a column
# selection is chained assignment and is not guaranteed to update `raw`.
raw['microbusiness_density'] = raw['microbusiness_density'].fillna(0)

# One-step validation: train on months before TS, predict the change at TS,
# and score the implied density against the actual value at TS + 1.
TS = 39
print(f'TS: {TS}')
train_indices = (raw.istest==0) & (raw.dcount < TS) & (raw.dcount >= 1) & (raw.lastactive>ACT_THR)
valid_indices = (raw.istest==0) & (raw.dcount == TS)
# model = get_model()
models = base_models()

# Train each of the models on the current TS.
# The target is clipped to [-0.002, 0.006] before fitting.
train_target = raw.loc[train_indices, 'target'].clip(-0.002, 0.006)
lst_tr_pred = {}
lst_val_preds = {}
for key, model in models.items():
    model.fit(raw.loc[train_indices, features], train_target)
    lst_tr_pred[key] = model.predict(raw.loc[train_indices, features])
    lst_val_preds[key] = model.predict(raw.loc[valid_indices, features])

# Stack the base predictions column-wise, in the order of `models`.
train_preds = np.column_stack(tuple(lst_tr_pred.values()))
valid_preds = np.column_stack(tuple(lst_val_preds.values()))

# NOTE(review): the meta-model is fitted on in-sample predictions of the base
# models (same rows they were trained on), not on out-of-fold predictions.
meta_model = get_model()
meta_model.fit(train_preds, train_target)
ypred = meta_model.predict(valid_preds)
#raw.loc[valid_indices, 'target'] = ypred
# 'k' holds the implied next-month density: (1 + predicted change) * current density.
raw.loc[valid_indices, 'k'] = ypred + 1
raw.loc[valid_indices,'k'] = raw.loc[valid_indices,'k'] * raw.loc[valid_indices,'microbusiness_density']

# Validate
lastval = raw.loc[raw.dcount==TS, ['cfips', 'microbusiness_density']].set_index('cfips').to_dict()['microbusiness_density']
dt = raw.loc[raw.dcount==TS, ['cfips', 'k']].set_index('cfips').to_dict()['k']
df = raw.loc[raw.dcount==(TS+1),
             ['cfips', 'microbusiness_density', 'state', 'lastactive', 'mbd_lag_1']].reset_index(drop=True)
df['pred'] = df['cfips'].map(dt)
df['lastval'] = df['cfips'].map(lastval)
# df.loc[df['lastval'].isnull(), 'lastval'] = df.loc[df['lastval'].isnull(), 'microbusiness_density']
# Low-activity counties were not trained on: use the last observed value.
df.loc[df['lastactive']<=ACT_THR, 'pred'] = df.loc[df['lastactive']<=ACT_THR, 'lastval']
print('Last Value SMAPE:', smape(df['microbusiness_density'], df['lastval']) )
print('SMAPE:', smape(df['microbusiness_density'], df['pred']))
print()
ind = (raw.dcount > MONTH_1)&(raw.dcount <= MONTH_last)
# Recorded notebook output starts here:
# TS: 39
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:177: UserWarning: Found `n_iter` in params. Will use it instead of argument
_log_warning(f"Found `{alias}` in params. Will use it instead of argument")
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
Last Value SMAPE: 1.889206717018118
SMAPE: 1.8637258032256854
# Recursive forecast: for six consecutive months, refit the stack, predict the
# relative change at month TS and roll the density forward to month TS + 1.
# NOTE(review): the lag/rolling feature columns were computed once by
# build_features and are not refreshed after 'target' is overwritten below,
# so rows for later months do not see the newly predicted changes.
# NOTE(review): rows with istest == 0 whose 'target' was overwritten by a
# prediction in an earlier iteration are part of the next training set.
# NOTE(review): unlike the validation step, no last-value fallback is applied
# for counties with lastactive <= ACT_THR.
for _ in range(6):
    TS = TS+1
    print(f'TS: {TS}')
    train_indices = (raw.istest==0) & (raw.dcount < TS) & (raw.dcount >= 1) & (raw.lastactive>ACT_THR)
    valid_indices = (raw.dcount == TS)
    train_target = raw.loc[train_indices, 'target'].clip(-0.002, 0.006)
    lst_tr_pred = {}
    lst_val_preds = {}
    for key, model in models.items():
        model.fit(raw.loc[train_indices, features], train_target)
        lst_tr_pred[key] = model.predict(raw.loc[train_indices, features])
        lst_val_preds[key] = model.predict(raw.loc[valid_indices, features])
    train_preds = np.column_stack(tuple(lst_tr_pred.values()))
    valid_preds = np.column_stack(tuple(lst_val_preds.values()))
    meta_model.fit(train_preds, train_target)
    ypred = meta_model.predict(valid_preds)
    raw.loc[(raw.dcount == TS), 'target'] = ypred
    # raw.loc[valid_indices, 'k'] = ypred + 1
    # raw.loc[valid_indices,'k'] = raw.loc[valid_indices,'k'] * raw.loc[valid_indices,'microbusiness_density']
    # #Validate
    # lastval = raw.loc[raw.dcount==TS, ['cfips', 'microbusiness_density']].set_index('cfips').to_dict()['microbusiness_density']
    # dt = raw.loc[raw.dcount==TS, ['cfips', 'k']].set_index('cfips').to_dict()['k']
    # df = raw.loc[raw.dcount==(TS+1), ['cfips', 'microbusiness_density', 'state', 'lastactive', 'mbd_lag_1']].reset_index(drop=True)
    # df['pred'] = df['cfips'].map(dt)
    # df['lastval'] = df['cfips'].map(lastval)
    # df.loc[df['lastactive']<=ACT_THR, 'pred'] = df.loc[df['lastactive']<=ACT_THR, 'lastval']
    # print(df['pred'].values)
    # raw.loc[raw.dcount==(TS+1), 'ypred'] = df['pred'].values
    # raw.loc[raw.dcount==(TS+1), 'ypred_last'] = df['lastval'].values
    # Positional assignment: relies on the TS and TS + 1 rows listing the
    # counties in the same order.
    raw.loc[(raw.dcount == TS+1), 'microbusiness_density'] = (1+ypred)*np.array(raw.loc[(raw.dcount == TS), 'microbusiness_density'])
# Recorded notebook output starts here:
# TS: 40
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
TS: 41
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
TS: 42
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
TS: 43
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
TS: 44
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
TS: 45
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
raw[['row_id','istest','target','microbusiness_density','k']].head(10)| row_id | istest | target | microbusiness_density | k | |
|---|---|---|---|---|---|
| 0 | 1001_2019-08-01 | 0 | 0.010101 | 2.856021 | 1.0 |
| 1 | 1001_2019-09-01 | 0 | 0.059265 | 2.884870 | 1.0 |
| 2 | 1001_2019-10-01 | 0 | -0.020489 | 3.055843 | 1.0 |
| 3 | 1001_2019-11-01 | 0 | 0.000000 | 2.993233 | 1.0 |
| 4 | 1001_2019-12-01 | 0 | -0.008066 | 2.993233 | 1.0 |
| 5 | 1001_2020-01-01 | 0 | -0.020129 | 2.969090 | 1.0 |
| 6 | 1001_2020-02-01 | 0 | 0.008217 | 2.909326 | 1.0 |
| 7 | 1001_2020-03-01 | 0 | 0.022820 | 2.933231 | 1.0 |
| 8 | 1001_2020-04-01 | 0 | 0.001594 | 3.000167 | 1.0 |
| 9 | 1001_2020-05-01 | 0 | 0.004773 | 3.004948 | 1.0 |
# Submission: every row from 2022-11 onwards, reduced to the two required columns.
# (The intermediate selection that also kept 'cfips' had no effect and is gone.)
test = raw[raw.first_day_of_month >= '2022-11-01'].copy()
test = test[['row_id', 'microbusiness_density']]
test.to_csv('submission.csv', index=False)
test.tail(10)| row_id | microbusiness_density | |
|---|---|---|
| 147296 | 56043_2023-05-01 | 3.042164 |
| 147297 | 56043_2023-06-01 | 3.036500 |
| 147337 | 56045_2022-11-01 | 1.785395 |
| 147338 | 56045_2022-12-01 | 1.803249 |
| 147339 | 56045_2023-01-01 | 1.807518 |
| 147340 | 56045_2023-02-01 | 1.803671 |
| 147341 | 56045_2023-03-01 | 1.798988 |
| 147342 | 56045_2023-04-01 | 1.795154 |
| 147343 | 56045_2023-05-01 | 1.791809 |
| 147344 | 56045_2023-06-01 | 1.788477 |