import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
# Directory prefix of the competition input files; the competition CSVs are
# loaded by appending a file name to this string.
BASE = '../input/godaddy-microbusiness-density-forecasting/'
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    positions where both values are exactly zero contribute 0 instead of
    dividing by zero. The result is the mean contribution times 100.

    Args:
        y_true: 1-D array-like of observed values.
        y_pred: 1-D array-like of predictions, same length as ``y_true``.

    Returns:
        The SMAPE as a float (0 means a perfect match, 200 is the maximum).
    """
    # Coerce to plain float arrays so lists work and pandas inputs are
    # compared by position rather than by index label.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Only divide where the denominator is non-zero; the rest stays at 0.
    smap = np.zeros(len(y_true))
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * np.mean(smap)
# Load the competition tables; file names are appended to BASE.
census = pd.read_csv(BASE + 'census_starter.csv')
train = pd.read_csv(BASE + 'train.csv')
reaveal_test = pd.read_csv(BASE + 'revealed_test.csv')

# Append the revealed test rows to the training history and order the result
# by county code and month. reset_index() is called without drop=True, so the
# previous index is kept as an 'index' column.
train = (
    pd.concat([train, reaveal_test])
    .sort_values(by=['cfips', 'first_day_of_month'])
    .reset_index()
)

# Drop the 2022-11 and 2022-12 rows from the test frame.
# NOTE(review): presumably because revealed_test.csv already supplies those
# months -- confirm against the data.
test = pd.read_csv(BASE + 'test.csv')
drop_index = (test.first_day_of_month == '2022-11-01') | (test.first_day_of_month == '2022-12-01')
test = test.loc[~drop_index, :]

sub = pd.read_csv(BASE + 'sample_submission.csv')
coords = pd.read_csv("/kaggle/input/usa-counties-coordinates/cfips_location.csv")
print(train.shape, test.shape, sub.shape)
# Flag the origin of each row, then stack train and test into one frame
# ordered by county code and row id.
train['istest'] = 0
test['istest'] = 1
raw = pd.concat((train, test)).sort_values(['cfips', 'row_id']).reset_index(drop=True)
# Attach the coordinate columns per county; the 'name' column of `coords`
# is dropped before the merge.
raw = raw.merge(coords.drop("name", axis=1), on="cfips")

# Categorical copies of state/county taken before the forward fill below.
raw['state_i1'] = raw['state'].astype('category')
raw['county_i1'] = raw['county'].astype('category')
raw['first_day_of_month'] = pd.to_datetime(raw["first_day_of_month"])
# Forward-fill county/state names within each county so rows that lack them
# inherit the value of the preceding row.
raw['county'] = raw.groupby('cfips')['county'].ffill()
raw['state'] = raw.groupby('cfips')['state'].ffill()
# dcount: 0-based position of the row within its county.
raw["dcount"] = raw.groupby(['cfips'])['row_id'].cumcount()
# Integer codes for (county, state) pairs and for states.
raw['county_i'] = (raw['county'] + raw['state']).factorize()[0]
raw['state_i'] = raw['state'].factorize()[0]
# scale: days since the first month, then replaced by its integer code.
raw['scale'] = (raw['first_day_of_month'] - raw['first_day_of_month'].min()).dt.days
raw['scale'] = raw['scale'].factorize()[0]
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
(128535, 8) (18810, 3) (25080, 2)
raw
index | row_id | cfips | county | state | first_day_of_month | microbusiness_density | active | istest | lng | lat | state_i1 | county_i1 | dcount | county_i | state_i | scale | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 1001_2019-08-01 | 1001 | Autauga County | Alabama | 2019-08-01 | 3.007682 | 1249.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 0 | 0 | 0 | 0 |
1 | 1.0 | 1001_2019-09-01 | 1001 | Autauga County | Alabama | 2019-09-01 | 2.884870 | 1198.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 1 | 0 | 0 | 1 |
2 | 2.0 | 1001_2019-10-01 | 1001 | Autauga County | Alabama | 2019-10-01 | 3.055843 | 1269.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 2 | 0 | 0 | 2 |
3 | 3.0 | 1001_2019-11-01 | 1001 | Autauga County | Alabama | 2019-11-01 | 2.993233 | 1243.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 3 | 0 | 0 | 3 |
4 | 4.0 | 1001_2019-12-01 | 1001 | Autauga County | Alabama | 2019-12-01 | 2.993233 | 1243.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 4 | 0 | 0 | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
147340 | NaN | 56045_2023-02-01 | 56045 | Weston County | Wyoming | 2023-02-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 42 | 3134 | 50 | 42 |
147341 | NaN | 56045_2023-03-01 | 56045 | Weston County | Wyoming | 2023-03-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 43 | 3134 | 50 | 43 |
147342 | NaN | 56045_2023-04-01 | 56045 | Weston County | Wyoming | 2023-04-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 44 | 3134 | 50 | 44 |
147343 | NaN | 56045_2023-05-01 | 56045 | Weston County | Wyoming | 2023-05-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 45 | 3134 | 50 | 45 |
147344 | NaN | 56045_2023-06-01 | 56045 | Weston County | Wyoming | 2023-06-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 46 | 3134 | 50 | 46 |
147345 rows × 17 columns
# Per-county smoothing of abrupt level shifts in microbusiness_density.
# Walking backwards from position 37 to 3, a month-over-month change larger
# than 10% of the mean of the earlier values is treated as a jump, and every
# earlier value is shifted by that jump (less a 0.003 margin) so the series
# lines up with the later level.
# NOTE(review): indexing up to var[37] assumes every county has at least 38
# rows -- confirm against the data.
for o in tqdm(raw.cfips.unique()):
    indices = (raw['cfips'] == o)
    # Work on a private copy of the county's values, then write them back.
    var = raw.loc[indices, 'microbusiness_density'].values.copy()
    for i in range(37, 2, -1):
        thr = 0.10 * np.mean(var[:i])
        difa = var[i] - var[i - 1]
        if (difa >= thr) or (difa <= -thr):
            if difa > 0:
                var[:i] += difa - 0.003
            else:
                var[:i] += difa + 0.003
    # The loop stops at i == 3, so the first value is set from the second.
    var[0] = var[1] * 0.99
    raw.loc[indices, 'microbusiness_density'] = var
{"model_id":"c5336d6305e14089a90c3de2a2101eda","version_major":2,"version_minor":0}
raw
index | row_id | cfips | county | state | first_day_of_month | microbusiness_density | active | istest | lng | lat | state_i1 | county_i1 | dcount | county_i | state_i | scale | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 1001_2019-08-01 | 1001 | Autauga County | Alabama | 2019-08-01 | 2.856021 | 1249.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 0 | 0 | 0 | 0 |
1 | 1.0 | 1001_2019-09-01 | 1001 | Autauga County | Alabama | 2019-09-01 | 2.884870 | 1198.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 1 | 0 | 0 | 1 |
2 | 2.0 | 1001_2019-10-01 | 1001 | Autauga County | Alabama | 2019-10-01 | 3.055843 | 1269.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 2 | 0 | 0 | 2 |
3 | 3.0 | 1001_2019-11-01 | 1001 | Autauga County | Alabama | 2019-11-01 | 2.993233 | 1243.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 3 | 0 | 0 | 3 |
4 | 4.0 | 1001_2019-12-01 | 1001 | Autauga County | Alabama | 2019-12-01 | 2.993233 | 1243.0 | 0 | -86.642900 | 32.535142 | Alabama | Autauga County | 4 | 0 | 0 | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
147340 | NaN | 56045_2023-02-01 | 56045 | Weston County | Wyoming | 2023-02-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 42 | 3134 | 50 | 42 |
147341 | NaN | 56045_2023-03-01 | 56045 | Weston County | Wyoming | 2023-03-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 43 | 3134 | 50 | 43 |
147342 | NaN | 56045_2023-04-01 | 56045 | Weston County | Wyoming | 2023-04-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 44 | 3134 | 50 | 44 |
147343 | NaN | 56045_2023-05-01 | 56045 | Weston County | Wyoming | 2023-05-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 45 | 3134 | 50 | 45 |
147344 | NaN | 56045_2023-06-01 | 56045 | Weston County | Wyoming | 2023-06-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 46 | 3134 | 50 | 46 |
147345 rows × 17 columns
# One-month lag of the density per county and the absolute relative change
# against it.
lag = 1
# bfill() runs on the shifted column as a whole, after the per-county shift.
raw[f'mbd_lag_{lag}'] = raw.groupby('cfips')['microbusiness_density'].shift(lag).bfill()
raw[f'dif_{lag}'] = (raw['microbusiness_density'] / raw[f'mbd_lag_{lag}']).fillna(1).clip(0, None) - 1
# Explicit values where the lag is zero and the ratio is not meaningful:
# 0 by default, 1 when the current density is positive.
raw.loc[(raw[f'mbd_lag_{lag}'] == 0), f'dif_{lag}'] = 0
raw.loc[(raw[f'microbusiness_density'] > 0) & (raw[f'mbd_lag_{lag}'] == 0), f'dif_{lag}'] = 1
raw[f'dif_{lag}'] = raw[f'dif_{lag}'].abs()
# raw.groupby('dcount')['dif'].sum().plot()
raw.tail()
index | row_id | cfips | county | state | first_day_of_month | microbusiness_density | active | istest | lng | lat | state_i1 | county_i1 | dcount | county_i | state_i | scale | mbd_lag_1 | dif_1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
147340 | NaN | 56045_2023-02-01 | 56045 | Weston County | Wyoming | 2023-02-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 42 | 3134 | 50 | 42 | NaN | 0.0 |
147341 | NaN | 56045_2023-03-01 | 56045 | Weston County | Wyoming | 2023-03-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 43 | 3134 | 50 | 43 | NaN | 0.0 |
147342 | NaN | 56045_2023-04-01 | 56045 | Weston County | Wyoming | 2023-04-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 44 | 3134 | 50 | 44 | NaN | 0.0 |
147343 | NaN | 56045_2023-05-01 | 56045 | Weston County | Wyoming | 2023-05-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 45 | 3134 | 50 | 45 | NaN | 0.0 |
147344 | NaN | 56045_2023-06-01 | 56045 | Weston County | Wyoming | 2023-06-01 | NaN | NaN | 1 | -104.567404 | 43.840315 | NaN | NaN | 46 | 3134 | 50 | 46 | NaN | 0.0 |
# target: relative change from this month's density to next month's, per county.
raw['target'] = raw.groupby('cfips')['microbusiness_density'].shift(-1)
raw['target'] = raw['target'] / raw['microbusiness_density'] - 1

# Force a zero target for two specific counties.
# NOTE(review): the reason for singling out 28055 and 48269 is not visible
# here -- confirm.
raw.loc[raw['cfips'] == 28055, 'target'] = 0.0
raw.loc[raw['cfips'] == 48269, 'target'] = 0.0

# lastactive: the groupby 'last' aggregate of 'active' per county, broadcast
# to every row of that county.
raw['lastactive'] = raw.groupby('cfips')['active'].transform('last')

# dt = raw.loc[raw.dcount==40].groupby('cfips')['microbusiness_density'].agg('last')
# raw['lastactive'].clip(0, 8000).hist(bins=30)
raw
index | row_id | cfips | county | state | first_day_of_month | microbusiness_density | active | istest | lng | ... | state_i1 | county_i1 | dcount | county_i | state_i | scale | mbd_lag_1 | dif_1 | target | lastactive | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 1001_2019-08-01 | 1001 | Autauga County | Alabama | 2019-08-01 | 2.856021 | 1249.0 | 0 | -86.642900 | ... | Alabama | Autauga County | 0 | 0 | 0 | 0 | 2.856021 | 0.000000 | 0.010101 | 1475.0 |
1 | 1.0 | 1001_2019-09-01 | 1001 | Autauga County | Alabama | 2019-09-01 | 2.884870 | 1198.0 | 0 | -86.642900 | ... | Alabama | Autauga County | 1 | 0 | 0 | 1 | 2.856021 | 0.010101 | 0.059265 | 1475.0 |
2 | 2.0 | 1001_2019-10-01 | 1001 | Autauga County | Alabama | 2019-10-01 | 3.055843 | 1269.0 | 0 | -86.642900 | ... | Alabama | Autauga County | 2 | 0 | 0 | 2 | 2.884870 | 0.059265 | -0.020489 | 1475.0 |
3 | 3.0 | 1001_2019-11-01 | 1001 | Autauga County | Alabama | 2019-11-01 | 2.993233 | 1243.0 | 0 | -86.642900 | ... | Alabama | Autauga County | 3 | 0 | 0 | 3 | 3.055843 | 0.020489 | 0.000000 | 1475.0 |
4 | 4.0 | 1001_2019-12-01 | 1001 | Autauga County | Alabama | 2019-12-01 | 2.993233 | 1243.0 | 0 | -86.642900 | ... | Alabama | Autauga County | 4 | 0 | 0 | 4 | 2.993233 | 0.000000 | -0.008066 | 1475.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
147340 | NaN | 56045_2023-02-01 | 56045 | Weston County | Wyoming | 2023-02-01 | NaN | NaN | 1 | -104.567404 | ... | NaN | NaN | 42 | 3134 | 50 | 42 | NaN | 0.000000 | NaN | 101.0 |
147341 | NaN | 56045_2023-03-01 | 56045 | Weston County | Wyoming | 2023-03-01 | NaN | NaN | 1 | -104.567404 | ... | NaN | NaN | 43 | 3134 | 50 | 43 | NaN | 0.000000 | NaN | 101.0 |
147342 | NaN | 56045_2023-04-01 | 56045 | Weston County | Wyoming | 2023-04-01 | NaN | NaN | 1 | -104.567404 | ... | NaN | NaN | 44 | 3134 | 50 | 44 | NaN | 0.000000 | NaN | 101.0 |
147343 | NaN | 56045_2023-05-01 | 56045 | Weston County | Wyoming | 2023-05-01 | NaN | NaN | 1 | -104.567404 | ... | NaN | NaN | 45 | 3134 | 50 | 45 | NaN | 0.000000 | NaN | 101.0 |
147344 | NaN | 56045_2023-06-01 | 56045 | Weston County | Wyoming | 2023-06-01 | NaN | NaN | 1 | -104.567404 | ... | NaN | NaN | 46 | 3134 | 50 | 46 | NaN | 0.000000 | NaN | 101.0 |
147345 rows × 21 columns
def build_features(raw, target='microbusiness_density', target_act='active_tmp', lags=6):
    """Add lag, rolling, census and population-estimate features to ``raw``.

    The lag and rolling columns are written onto the frame that is passed in;
    the two merges then produce new frames, so callers must use the returned
    frame. The module-level ``census`` frame is read, and the county
    population estimates are loaded from a fixed CSV path.

    Args:
        raw: DataFrame with a 'cfips' column plus the ``target`` and
            ``target_act`` columns.
        target: Column whose per-county lags become ``mbd_lag_<n>``.
        target_act: Column whose per-county n-step differences become
            ``act_lag_<n>``.
        lags: Exclusive upper bound of the lag range; lags 1 .. lags-1 are
            created.

    Returns:
        Tuple ``(raw, feats)``: the merged frame and the list of feature
        column names added by this function.
    """
    feats = []

    for lag in range(1, lags):
        raw[f'mbd_lag_{lag}'] = raw.groupby('cfips')[target].shift(lag)
        raw[f'act_lag_{lag}'] = raw.groupby('cfips')[target_act].diff(lag)
        feats.append(f'mbd_lag_{lag}')
        feats.append(f'act_lag_{lag}')

    # Rolling sums of the lag-1 column over several window lengths. The
    # column names say "rollmea", but the aggregate applied is sum().
    lag = 1
    for window in [2, 4, 6, 8, 10]:
        raw[f'mbd_rollmea{window}_{lag}'] = raw.groupby('cfips')[f'mbd_lag_{lag}'].transform(
            lambda s: s.rolling(window, min_periods=1).sum()
        )
        feats.append(f'mbd_rollmea{window}_{lag}')

    # Every census column except the join key becomes a feature.
    census_columns = list(census.columns)
    census_columns.remove("cfips")

    raw = raw.merge(census, on="cfips", how="left")
    feats += census_columns

    co_est = pd.read_csv("/kaggle/input/us-indicator/co-est2021-alldata.csv", encoding='latin-1')
    # Build the join key from the state and county codes.
    co_est["cfips"] = co_est.STATE * 1000 + co_est.COUNTY
    co_columns = [
        'SUMLEV',
        'DIVISION',
        'ESTIMATESBASE2020',
        'POPESTIMATE2020',
        'POPESTIMATE2021',
        'NPOPCHG2020',
        'NPOPCHG2021',
        'BIRTHS2020',
        'BIRTHS2021',
        'DEATHS2020',
        'DEATHS2021',
        'NATURALCHG2020',
        'NATURALCHG2021',
        'INTERNATIONALMIG2020',
        'INTERNATIONALMIG2021',
        'DOMESTICMIG2020',
        'DOMESTICMIG2021',
        'NETMIG2020',
        'NETMIG2021',
        'RESIDUAL2020',
        'RESIDUAL2021',
        'GQESTIMATESBASE2020',
        'GQESTIMATES2020',
        'GQESTIMATES2021',
        'RBIRTH2021',
        'RDEATH2021',
        'RNATURALCHG2021',
        'RINTERNATIONALMIG2021',
        'RDOMESTICMIG2021',
        'RNETMIG2021',
    ]
    # The whole co_est table is merged; only co_columns are listed as features.
    raw = raw.merge(co_est, on="cfips", how="left")
    feats += co_columns
    return raw, feats
# Build features from lags of 'target' and differences of 'active'
# (lags 1..8, since the upper bound is exclusive).
raw, feats = build_features(raw, 'target', 'active', lags=9)
# Final model inputs: state code, the generated features, coordinates and
# the time index.
features = ['state_i']
features += feats
features += ['lng', 'lat', 'scale']
# print(features)
# raw.loc[raw.dcount==40, features].head(10)
# Latitude and longitude feature engineering, from samu2505.
coordinates = raw[['lng', 'lat']].values

# Encoding tricks: scale each coordinate by a geometric ladder of emb_size
# frequencies running from 1 up to (just below) `precision`.
emb_size = 20
precision = 1e6

# Add a trailing axis so the frequency vector broadcasts across it.
latlon = np.expand_dims(coordinates, axis=-1)

# m is the common ratio of the ladder: m ** emb_size == precision.
m = np.exp(np.log(precision) / emb_size)
angle_freq = m ** np.arange(emb_size)
angle_freq = angle_freq.reshape(1, 1, emb_size)
latlon = latlon * angle_freq
# NOTE(review): only the even channels are passed through cos(); the odd
# channels keep the raw scaled values, and `latlon` is not added to `raw` or
# `features` in the code visible here -- confirm whether that is intended.
latlon[..., 0::2] = np.cos(latlon[..., 0::2])
def rot(df, angles=(15, 30, 45)):
    """Add angle-mixed latitude/longitude columns to ``df`` in place.

    For every angle ``a`` (in degrees) two columns are written:

    * ``rot_<a>_x = cos(a) * lat + sin(a) * lng``
    * ``rot_<a>_y = cos(a) * lat - sin(a) * lng``

    Args:
        df: DataFrame with 'lat' and 'lng' columns; it is modified in place.
        angles: Angles in degrees. Defaults to ``(15, 30, 45)``.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    for angle in angles:
        # The trig factors depend only on the angle, so compute them once.
        cos_a = np.cos(np.radians(angle))
        sin_a = np.sin(np.radians(angle))

        df[f'rot_{angle}_x'] = (cos_a * df['lat']) + (sin_a * df['lng'])
        df[f'rot_{angle}_y'] = (cos_a * df['lat']) - (sin_a * df['lng'])

    return df
# Add the angle-mixed coordinate columns and register them as model inputs.
raw = rot(raw)
features += ['rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y', 'rot_45_x', 'rot_45_y']
def get_model():
    """Return a fresh, unfitted CatBoost regressor with fixed hyperparameters.

    Returns:
        A ``catboost.CatBoostRegressor`` configured with the MAPE loss.
    """
    # Imported locally so catboost is only required when a model is built.
    import catboost as cat

    # we should decrease the num_iterations of catboost
    cat_model = cat.CatBoostRegressor(
        iterations=2000,
        loss_function="MAPE",
        verbose=0,
        grow_policy='SymmetricTree',
        learning_rate=0.035,
        colsample_bylevel=0.8,
        max_depth=5,
        l2_leaf_reg=0.2,
        subsample=0.70,
        max_bin=4096,
    )
    return cat_model
def base_models():
    """Return the unfitted first-level regressors, keyed by short name.

    Returns:
        Dict with keys ``'xgb'``, ``'lgbm'`` and ``'cat'``, in that insertion
        order, mapping to an XGBoost, a LightGBM and a CatBoost regressor.
    """
    # Imported locally so the boosting libraries are only required when the
    # models are actually built.
    import lightgbm as lgb
    import xgboost as xgb
    import catboost as cat

    # LGBM model
    # 'n_estimators' replaces the 'n_iter' alias and the duplicate 'seed'
    # entry is dropped ('random_state' already fixes it); both only caused
    # LightGBM alias warnings and do not change the configuration.
    params = {
        'n_estimators': 300,
        'boosting_type': 'dart',
        'verbosity': -1,
        'objective': 'l1',
        'random_state': 42,
        'colsample_bytree': 0.8841279649367693,
        'colsample_bynode': 0.10142964450634374,
        'max_depth': 8,
        'learning_rate': 0.003647749926797374,
        'lambda_l2': 0.5,
        'num_leaves': 61,
        'min_data_in_leaf': 213,
    }
    lgb_model = lgb.LGBMRegressor(**params)

    xgb_model = xgb.XGBRegressor(
        objective='reg:pseudohubererror',
        tree_method="hist",
        n_estimators=795,
        learning_rate=0.0075,
        max_leaves=17,
        subsample=0.50,
        colsample_bytree=0.50,
        max_bin=4096,
        n_jobs=2,
    )

    # we should decrease the num_iterations of catboost
    cat_model = cat.CatBoostRegressor(
        iterations=2000,
        loss_function="MAPE",
        verbose=0,
        grow_policy='SymmetricTree',
        learning_rate=0.035,
        colsample_bylevel=0.8,
        max_depth=5,
        l2_leaf_reg=0.2,
        subsample=0.70,
        max_bin=4096,
    )

    # Insertion order fixes the column order when predictions are stacked.
    models = {}
    models['xgb'] = xgb_model
    models['lgbm'] = lgb_model
    models['cat'] = cat_model

    return models
# Counties whose 'lastactive' is at or below ACT_THR are left out of training
# and, when scoring, fall back to the last observed value.
ACT_THR = 150
MONTH_1 = 20
MONTH_last = 40

# 'k' starts as a neutral multiplier and later holds the predicted density
# for the validation rows.
raw['k'] = 1.
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which pandas does not guarantee will update `raw`.
raw['microbusiness_density'] = raw['microbusiness_density'].fillna(0)

TS = 39
print(f'TS: {TS}')

# Train on known rows before position TS (excluding each county's first row
# and low-activity counties); validate on the known rows at position TS.
train_indices = (raw.istest == 0) & (raw.dcount < TS) & (raw.dcount >= 1) & (raw.lastactive > ACT_THR)
valid_indices = (raw.istest == 0) & (raw.dcount == TS)

# model = get_model()
models = base_models()

# Train each of the models on the current TS
lst_tr_pred = {}
lst_val_preds = {}
for key, model in models.items():
    # The target is clipped to a narrow band before fitting.
    model.fit(
        raw.loc[train_indices, features],
        raw.loc[train_indices, 'target'].clip(-0.002, 0.006))
    lst_tr_pred[key] = model.predict(raw.loc[train_indices, features])
    lst_val_preds[key] = model.predict(raw.loc[valid_indices, features])

# One column per base model, in the dict's insertion order.
train_preds = np.column_stack(tuple(lst_tr_pred.values()))
valid_preds = np.column_stack(tuple(lst_val_preds.values()))

# Second-level model fitted on the base models' training predictions.
meta_model = get_model()
meta_model.fit(train_preds, raw.loc[train_indices, 'target'].clip(-0.002, 0.006))
ypred = meta_model.predict(valid_preds)

#raw.loc[valid_indices, 'target'] = ypred
# Turn the predicted relative change into a density: (1 + change) * current.
raw.loc[valid_indices, 'k'] = ypred + 1
raw.loc[valid_indices, 'k'] = raw.loc[valid_indices, 'k'] * raw.loc[valid_indices, 'microbusiness_density']

# Validate
# Per-county lookups at position TS: the observed density (naive forecast)
# and the model forecast stored in 'k'.
lastval = raw.loc[raw.dcount == TS, ['cfips', 'microbusiness_density']].set_index('cfips').to_dict()['microbusiness_density']
dt = raw.loc[raw.dcount == TS, ['cfips', 'k']].set_index('cfips').to_dict()['k']

# Ground truth is the row one position later.
df = raw.loc[raw.dcount == (TS + 1),
             ['cfips', 'microbusiness_density', 'state', 'lastactive', 'mbd_lag_1']].reset_index(drop=True)
df['pred'] = df['cfips'].map(dt)
df['lastval'] = df['cfips'].map(lastval)

# df.loc[df['lastval'].isnull(), 'lastval'] = df.loc[df['lastval'].isnull(), 'microbusiness_density']
# Low-activity counties were excluded from training; use the naive forecast.
df.loc[df['lastactive'] <= ACT_THR, 'pred'] = df.loc[df['lastactive'] <= ACT_THR, 'lastval']

print('Last Value SMAPE:', smape(df['microbusiness_density'], df['lastval']))
print('SMAPE:', smape(df['microbusiness_density'], df['pred']))
print()
# NOTE(review): `ind` is not read again in the code visible here.
ind = (raw.dcount > MONTH_1) & (raw.dcount <= MONTH_last)
TS: 39
/opt/conda/lib/python3.7/site-packages/lightgbm/engine.py:177: UserWarning: Found `n_iter` in params. Will use it instead of argument
_log_warning(f"Found `{alias}` in params. Will use it instead of argument")
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
Last Value SMAPE: 1.889206717018118
SMAPE: 1.8637258032256854
# Roll the forecast forward six positions, refitting at each step and writing
# the predicted density of the next position back into `raw`.
# NOTE(review): the lag columns in `features` are built before this loop and
# are not recomputed after 'target' is overwritten below -- confirm intended.
for _ in range(6):
    TS = TS + 1
    print(f'TS: {TS}')

    train_indices = (raw.istest == 0) & (raw.dcount < TS) & (raw.dcount >= 1) & (raw.lastactive > ACT_THR)
    valid_indices = (raw.dcount == TS)

    lst_tr_pred = {}
    lst_val_preds = {}
    for key, model in models.items():
        model.fit(
            raw.loc[train_indices, features],
            raw.loc[train_indices, 'target'].clip(-0.002, 0.006))
        lst_tr_pred[key] = model.predict(raw.loc[train_indices, features])
        lst_val_preds[key] = model.predict(raw.loc[valid_indices, features])

    train_preds = np.column_stack(tuple(lst_tr_pred.values()))
    valid_preds = np.column_stack(tuple(lst_val_preds.values()))

    meta_model.fit(train_preds, raw.loc[train_indices, 'target'].clip(-0.002, 0.006))
    ypred = meta_model.predict(valid_preds)

    raw.loc[(raw.dcount == TS), 'target'] = ypred
    # raw.loc[valid_indices, 'k'] = ypred + 1
    # raw.loc[valid_indices,'k'] = raw.loc[valid_indices,'k'] * raw.loc[valid_indices,'microbusiness_density']
    # #Validate
    # lastval = raw.loc[raw.dcount==TS, ['cfips', 'microbusiness_density']].set_index('cfips').to_dict()['microbusiness_density']
    # dt = raw.loc[raw.dcount==TS, ['cfips', 'k']].set_index('cfips').to_dict()['k']
    # df = raw.loc[raw.dcount==(TS+1), ['cfips', 'microbusiness_density', 'state', 'lastactive', 'mbd_lag_1']].reset_index(drop=True)
    # df['pred'] = df['cfips'].map(dt)
    # df['lastval'] = df['cfips'].map(lastval)
    # df.loc[df['lastactive']<=ACT_THR, 'pred'] = df.loc[df['lastactive']<=ACT_THR, 'lastval']
    # print(df['pred'].values)
    # raw.loc[raw.dcount==(TS+1), 'ypred'] = df['pred'].values
    # raw.loc[raw.dcount==(TS+1), 'ypred_last'] = df['lastval'].values
    # Next position's density = (1 + predicted change) * this position's
    # density. The right-hand side is a plain array, so rows are matched by
    # position; this relies on both positions listing counties in the same
    # order.
    raw.loc[(raw.dcount == TS + 1), 'microbusiness_density'] = (1 + ypred) * np.array(raw.loc[(raw.dcount == TS), 'microbusiness_density'])
TS: 40
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
TS: 41
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
TS: 42
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
TS: 43
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
TS: 44
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
TS: 45
[LightGBM] [Warning] lambda_l2 is set=0.5, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.5
[LightGBM] [Warning] min_data_in_leaf is set=213, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=213
[LightGBM] [Warning] seed is set=42, random_state=42 will be ignored. Current value: seed=42
[LightGBM] [Warning] num_iterations is set=300, n_iter=300 will be ignored. Current value: num_iterations=300
# Preview the key columns after forecasting.
raw[['row_id', 'istest', 'target', 'microbusiness_density', 'k']].head(10)
row_id | istest | target | microbusiness_density | k | |
---|---|---|---|---|---|
0 | 1001_2019-08-01 | 0 | 0.010101 | 2.856021 | 1.0 |
1 | 1001_2019-09-01 | 0 | 0.059265 | 2.884870 | 1.0 |
2 | 1001_2019-10-01 | 0 | -0.020489 | 3.055843 | 1.0 |
3 | 1001_2019-11-01 | 0 | 0.000000 | 2.993233 | 1.0 |
4 | 1001_2019-12-01 | 0 | -0.008066 | 2.993233 | 1.0 |
5 | 1001_2020-01-01 | 0 | -0.020129 | 2.969090 | 1.0 |
6 | 1001_2020-02-01 | 0 | 0.008217 | 2.909326 | 1.0 |
7 | 1001_2020-03-01 | 0 | 0.022820 | 2.933231 | 1.0 |
8 | 1001_2020-04-01 | 0 | 0.001594 | 3.000167 | 1.0 |
9 | 1001_2020-05-01 | 0 | 0.004773 | 3.004948 | 1.0 |
# Keep the rows from 2022-11-01 onwards and write the two submission columns.
test = raw[raw.first_day_of_month >= '2022-11-01'].copy()
test = test[['row_id', 'microbusiness_density']]
test.to_csv('submission.csv', index=False)
test.tail(10)
row_id | microbusiness_density | |
---|---|---|
147296 | 56043_2023-05-01 | 3.042164 |
147297 | 56043_2023-06-01 | 3.036500 |
147337 | 56045_2022-11-01 | 1.785395 |
147338 | 56045_2022-12-01 | 1.803249 |
147339 | 56045_2023-01-01 | 1.807518 |
147340 | 56045_2023-02-01 | 1.803671 |
147341 | 56045_2023-03-01 | 1.798988 |
147342 | 56045_2023-04-01 | 1.795154 |
147343 | 56045_2023-05-01 | 1.791809 |
147344 | 56045_2023-06-01 | 1.788477 |