[1]:

import pandas as pd
import numpy as np
import time
import normet
import matplotlib.pyplot as plt
import matplotlib
from pylab import savefig

[2]:

# Load the MY1 observations; the 'date' column is parsed as datetimes and used as the index.
df = pd.read_csv(
    r'data/MY1.csv',
    index_col='date',
    parse_dates=['date'],
)

[3]:

# Show the loaded observations frame (rich repr; pandas truncates long frames).
df

[3]:

	O3	NO	NO2	NOXasNO2	SO2	CO	PM10	NV10	V10	PM2.5	...	AP10	AT2.5	AP2.5	site	code	latitude	longitude	location_type	Ox	NOx
date
2020-01-01 00:00:00	1.72961	78.38595	45.77784	165.96796	4.75424	NaN	69.0	60.0	9.0	58.1	...	1026.2	4.7	1025.2	London Marylebone Road	MY1	51.52253	-0.154611	Urban Traffic	23.960024	83.832703
2020-01-01 01:00:00	1.92918	88.61587	52.64325	188.51903	4.84394	0.397528	45.0	38.4	6.6	43.2	...	1026.2	4.6	1024.1	London Marylebone Road	MY1	51.52253	-0.154611	Urban Traffic	27.524407	95.223555
2020-01-01 02:00:00	1.99570	70.02935	44.76870	152.14554	3.09474	0.346417	46.2	39.1	7.1	43.0	...	1026.2	4.9	1024.1	London Marylebone Road	MY1	51.52253	-0.154611	Urban Traffic	23.579103	76.850791
2020-01-01 03:00:00	2.04559	67.58589	40.20699	143.83725	2.96019	0.335059	45.1	38.8	6.3	42.8	...	1026.2	4.6	1024.1	London Marylebone Road	MY1	51.52253	-0.154611	Urban Traffic	21.299094	72.654172
2020-01-01 04:00:00	2.99355	72.03298	47.26010	157.70912	3.83478	0.349257	40.8	34.2	6.6	36.8	...	1026.2	4.2	1024.1	London Marylebone Road	MY1	51.52253	-0.154611	Urban Traffic	25.320553	79.661030
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2020-12-31 19:00:00	20.70539	12.46950	30.47461	49.59424	1.12164	0.129904	13.9	11.8	2.1	11.7	...	1002.9	-0.1	1001.9	London Marylebone Road	MY1	51.52253	-0.154611	Urban Traffic	25.415693	25.050618
2020-12-31 20:00:00	24.14797	9.65279	26.51175	41.31249	1.12164	0.094180	14.6	11.3	3.3	11.0	...	1002.9	0.6	1001.9	London Marylebone Road	MY1	51.52253	-0.154611	Urban Traffic	25.080442	20.867401
2020-12-31 21:00:00	25.69464	12.46950	28.45232	47.57196	1.36199	0.087685	16.6	13.0	3.6	15.3	...	1003.9	0.8	1001.9	London Marylebone Road	MY1	51.52253	-0.154611	Urban Traffic	26.809326	24.029142
2020-12-31 22:00:00	26.39313	6.45629	25.05721	34.95672	0.88129	0.084437	19.1	16.0	3.1	17.1	...	1003.9	0.3	1002.9	London Marylebone Road	MY1	51.52253	-0.154611	Urban Traffic	25.432539	17.657011
2020-12-31 23:00:00	27.93980	6.61453	22.07004	32.21218	0.88129	0.087685	17.6	13.2	4.4	15.2	...	1003.9	0.3	1002.9	London Marylebone Road	MY1	51.52253	-0.154611	Urban Traffic	24.672379	16.270723

6373 rows × 53 columns

[4]:

# Load the ERA meteorological variables for the same site, indexed by the parsed 'date' column.
era = pd.read_csv(
    r'data/MY1_era.csv',
    index_col='date',
    parse_dates=['date'],
)

[5]:

# Show the loaded meteorological frame (rich repr; pandas truncates long frames).
era

[5]:

	u10	v10	d2m	t2m	blh	sp	ssrd	tcc	tp	rh2m	lat	lon
date
2020-01-01 00:00:00	-2.720528	1.545010	277.183465	278.394725	384.209053	102252.303312	-1.164153e-10	0.650958	0.000008	91.884130	51.52253	-0.154611
2020-01-01 01:00:00	-2.308789	1.282742	276.695430	277.772899	353.220263	102211.168636	-1.164153e-10	0.603699	0.000002	92.715877	51.52253	-0.154611
2020-01-01 02:00:00	-2.216471	0.758730	276.505662	277.463419	255.911846	102174.855967	-1.164153e-10	0.710378	0.000005	93.485560	51.52253	-0.154611
2020-01-01 03:00:00	-1.928623	0.509013	276.412816	277.305813	191.375560	102166.786485	-1.164153e-10	0.837765	0.000005	93.906363	51.52253	-0.154611
2020-01-01 04:00:00	-1.700043	0.607069	276.553051	277.478941	151.780210	102142.578039	-1.164153e-10	0.819103	0.000003	93.696878	51.52253	-0.154611
...	...	...	...	...	...	...	...	...	...	...	...	...
2020-12-31 19:00:00	1.380939	-1.945195	272.197565	273.557442	476.945688	99902.506413	-5.820766e-11	0.918149	0.000000	90.582979	51.52253	-0.154611
2020-12-31 20:00:00	0.987925	-2.102732	272.171041	273.629146	486.665851	99947.625909	-5.820766e-11	0.839639	0.000000	89.939908	51.52253	-0.154611
2020-12-31 21:00:00	0.681543	-1.933668	272.087408	273.470592	489.355002	100000.215520	-5.820766e-11	0.739354	0.000000	90.422188	51.52253	-0.154611
2020-12-31 22:00:00	1.020793	-0.583816	272.235319	272.926062	40.714872	100042.844978	-5.820766e-11	0.643753	0.000000	95.088677	51.52253	-0.154611
2020-12-31 23:00:00	0.959517	-0.377511	272.020979	272.681367	55.617254	100053.601944	-5.820766e-11	0.549403	0.000000	95.290673	51.52253	-0.154611

8784 rows × 12 columns

[6]:

# Put observations and meteorology side by side, aligned on the datetime index.
# pd.concat defaults to an outer join, so timestamps present in only one frame
# are kept and the other frame's columns are NaN there.
df1 = pd.concat(objs=[df, era], axis='columns')

AutoML-based weather normalisation

[7]:

# Build the modelling frame for PM2.5 with a seeded random training/testing split.
df1a = normet.prepare_data(
    df1,
    value='PM2.5',
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m'],
    split_method='random',
    fraction=0.75,
    seed=7654321,
)

[8]:

# Fit the AutoML model on the prepared frame, using the meteorological
# variables plus the derived time columns as predictors.
automl = normet.train_model(
    df1a,
    variables=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
);

2024-07-04 12:02:31 : Training AutoML...
2024-07-04 12:03:31 : Best model is xgboost with best model parameters of {'n_estimators': 80, 'max_leaves': 179, 'min_child_weight': 0.03469842703470486, 'learning_rate': 0.09050333780681966, 'subsample': 0.8788794424065157, 'colsample_bylevel': 0.7372635897769984, 'colsample_bytree': 0.9535003009271207, 'reg_alpha': 0.09353166342028453, 'reg_lambda': 3.6044671382295674}

[9]:

# Report which estimator the AutoML search selected and its hyper-parameters.
best_model, best_config = automl.best_estimator, automl.best_config
print("Best model:", best_model)
print("Best model parameters:", best_config)

Best model: xgboost
Best model parameters: {'n_estimators': 80, 'max_leaves': 179, 'min_child_weight': 0.03469842703470486, 'learning_rate': 0.09050333780681966, 'subsample': 0.8788794424065157, 'colsample_bylevel': 0.7372635897769984, 'colsample_bytree': 0.9535003009271207, 'reg_alpha': 0.09353166342028453, 'reg_lambda': 3.6044671382295674}

[10]:

# Inspect the record of the best AutoML trial (validation loss, timings, chosen config).
automl.best_result

[10]:

{'pred_time': 2.8282029858194133e-06,
 'wall_clock_time': 15.478726148605347,
 'metric_for_logging': {'pred_time': 2.8282029858194133e-06},
 'val_loss': 0.14506105728824534,
 'training_iteration': 1,
 'config': {'n_estimators': 80,
  'max_leaves': 179,
  'min_child_weight': 0.03469842703470486,
  'learning_rate': 0.09050333780681966,
  'subsample': 0.8788794424065157,
  'colsample_bylevel': 0.7372635897769984,
  'colsample_bytree': 0.9535003009271207,
  'reg_alpha': 0.09353166342028453,
  'reg_lambda': 3.6044671382295674},
 'config/n_estimators': 80,
 'config/max_leaves': 179,
 'config/min_child_weight': 0.03469842703470486,
 'config/learning_rate': 0.09050333780681966,
 'config/subsample': 0.8788794424065157,
 'config/colsample_bylevel': 0.7372635897769984,
 'config/colsample_bytree': 0.9535003009271207,
 'config/reg_alpha': 0.09353166342028453,
 'config/reg_lambda': 3.6044671382295674,
 'experiment_tag': 'exp',
 'time_total_s': 3.0076282024383545}

[11]:

# Importance scores of the fitted model, one value per input feature
# (presumably ordered like automl.feature_names_in_ — confirm).
automl.feature_importances_

[11]:

array([0.0512452 , 0.09356024, 0.05020848, 0.07993569, 0.0396647 ,
       0.21618505, 0.05815253, 0.02513707, 0.02636059, 0.03911485,
       0.02288253, 0.1130693 , 0.15781933, 0.02666452], dtype=float32)

[12]:

# Feature names as seen by the fitted model. Note that the order is not the
# order of the `variables` list passed to train_model ('weekday' comes first).
automl.feature_names_in_

[12]:

array(['weekday', 'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc',
       'tp', 'rh2m', 'date_unix', 'day_julian', 'hour'], dtype='<U10')

[13]:

# Stack the model-evaluation statistics for the testing split, the training
# split, and the whole prepared frame (relabelled so every row has set == "all").
mod_stats = pd.concat(
    [
        normet.modStats(frame, automl, set=subset)
        for subset, frame in (
            ('testing', df1a),
            ('training', df1a),
            ('all', df1a.assign(set="all")),
        )
    ]
)

[14]:

# Show the combined evaluation statistics (one row per data subset).
mod_stats

[14]:

n	FAC2	MB	MGE	NMB	NMGE	RMSE	r	p_level	COE	IOA	R2	set
1593	0.912116	0.065145	1.948816	0.007081	0.211829	2.796755	0.936845	***	0.645061	0.822530	0.877679	testing
4780	0.971967	-0.013397	0.655479	-0.001470	0.071933	0.868739	0.994758	***	0.880270	0.940135	0.989543	training
6373	0.957006	0.006235	0.978762	0.000683	0.107153	1.587833	0.981282	***	0.821371	0.910685	0.962915	all

[15]:

# Inspect the prepared frame: resample features, 'value', the derived time
# columns (date_unix, day_julian, weekday, hour) and the 'set' split label.
df1a

[15]:

	rowid	u10	d2m	rh2m	v10	ssrd	t2m	blh	tcc	sp	value	tp	date	date_unix	day_julian	weekday	hour	set
0	0	-2.720528	277.183465	91.884130	1.545010	-1.164153e-10	278.394725	384.209053	0.650958	102252.303312	58.1	0.000008	2020-01-01 00:00:00	1.577837e+09	1	3	0	training
1	1	-2.308789	276.695430	92.715877	1.282742	-1.164153e-10	277.772899	353.220263	0.603699	102211.168636	43.2	0.000002	2020-01-01 01:00:00	1.577840e+09	1	3	1	training
2	2	-2.216471	276.505662	93.485560	0.758730	-1.164153e-10	277.463419	255.911846	0.710378	102174.855967	43.0	0.000005	2020-01-01 02:00:00	1.577844e+09	1	3	2	testing
3	3	-1.928623	276.412816	93.906363	0.509013	-1.164153e-10	277.305813	191.375560	0.837765	102166.786485	42.8	0.000005	2020-01-01 03:00:00	1.577848e+09	1	3	3	training
4	4	-1.700043	276.553051	93.696878	0.607069	-1.164153e-10	277.478941	151.780210	0.819103	102142.578039	36.8	0.000003	2020-01-01 04:00:00	1.577851e+09	1	3	4	testing
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
6368	6368	1.380939	272.197565	90.582979	-1.945195	-5.820766e-11	273.557442	476.945688	0.918149	99902.506413	11.7	0.000000	2020-12-31 19:00:00	1.609441e+09	366	4	19	training
6369	6369	0.987925	272.171041	89.939908	-2.102732	-5.820766e-11	273.629146	486.665851	0.839639	99947.625909	11.0	0.000000	2020-12-31 20:00:00	1.609445e+09	366	4	20	training
6370	6370	0.681543	272.087408	90.422188	-1.933668	-5.820766e-11	273.470592	489.355002	0.739354	100000.215520	15.3	0.000000	2020-12-31 21:00:00	1.609448e+09	366	4	21	testing
6371	6371	1.020793	272.235319	95.088677	-0.583816	-5.820766e-11	272.926062	40.714872	0.643753	100042.844978	17.1	0.000000	2020-12-31 22:00:00	1.609452e+09	366	4	22	training
6372	6372	0.959517	272.020979	95.290673	-0.377511	-5.820766e-11	272.681367	55.617254	0.549403	100053.601944	15.2	0.000000	2020-12-31 23:00:00	1.609456e+09	366	4	23	training

6373 rows × 18 columns

[16]:

# Weather-normalise the prepared data with the trained model and time the call.
# No weather_df is passed here, so normet presumably resamples the
# meteorological variables from df1a itself — confirm against the normet docs.
# perf_counter() is a monotonic, high-resolution clock: unlike time.time() it
# cannot jump when the system clock is adjusted, so the difference is reliable.
start_time = time.perf_counter()
df_dew = normet.normalise(
    df1a,
    automl,
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    variables_resample=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m'],
    n_samples=1000,
    aggregate=True,
)
end_time = time.perf_counter()

# Compute the execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

2024-07-04 12:03:31 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:03:41 : Aggregating 1000 predictions...
Execution time: 9.66 seconds

[17]:

# Weather-normalise again, this time drawing the resampled meteorology from an
# explicit weather_df: the 2020-01-01 .. 2020-04-01 slice of the merged frame.
# perf_counter() is a monotonic, high-resolution clock: unlike time.time() it
# cannot jump when the system clock is adjusted, so the difference is reliable.
start_time = time.perf_counter()
df_dew1 = normet.normalise(
    df1a,
    automl,
    weather_df=df1.loc['2020-01-01':'2020-04-01', :],
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    variables_resample=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m'],
    n_samples=300,
    aggregate=True,
)
end_time = time.perf_counter()

# Compute the execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

2024-07-04 12:03:41 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:03:44 : Aggregating 300 predictions...
Execution time: 2.67 seconds

[18]:

# Take the meteorological columns of the first 100 rows of the merged frame.
# reset_index() moves 'date' out of the index, so the result has a plain
# integer index and no date column.
weather_df = (
    df1.reset_index()
    .iloc[:100]
    .loc[:, ['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m']]
)

[19]:

# Preview the first five rows of the resampling pool.
weather_df.head(n=5)

[19]:

	u10	v10	d2m	t2m	blh	sp	ssrd	tcc	tp	rh2m
0	-2.720528	1.545010	277.183465	278.394725	384.209053	102252.303312	-1.164153e-10	0.650958	0.000008	91.884130
1	-2.308789	1.282742	276.695430	277.772899	353.220263	102211.168636	-1.164153e-10	0.603699	0.000002	92.715877
2	-2.216471	0.758730	276.505662	277.463419	255.911846	102174.855967	-1.164153e-10	0.710378	0.000005	93.485560
3	-1.928623	0.509013	276.412816	277.305813	191.375560	102166.786485	-1.164153e-10	0.837765	0.000005	93.906363
4	-1.700043	0.607069	276.553051	277.478941	151.780210	102142.578039	-1.164153e-10	0.819103	0.000003	93.696878

[20]:

# Weather-normalise using the small 100-row weather_df as the resampling pool,
# and time the call.
# perf_counter() is a monotonic, high-resolution clock: unlike time.time() it
# cannot jump when the system clock is adjusted, so the difference is reliable.
start_time = time.perf_counter()
df_dew2 = normet.normalise(
    df1a,
    automl,
    weather_df=weather_df,
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    variables_resample=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m'],
    n_samples=300,
    aggregate=True,
)
end_time = time.perf_counter()

# Compute the execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

2024-07-04 12:03:44 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:03:46 : Aggregating 300 predictions...
Execution time: 2.26 seconds

[21]:

# Overlay the two normalised series on one set of axes. Each line gets a label
# and the axes get a legend so the colours can be told apart; the trailing ';'
# suppresses the Axes/Legend repr in the cell output.
ax = df_dew1['normalised'].plot(c='r', label='df_dew1 (weather_df: 2020-01-01 to 2020-04-01)')
df_dew2['normalised'].plot(ax=ax, c='b', label='df_dew2 (weather_df: first 100 rows)')
ax.set_ylabel('normalised')
ax.legend();

[21]:

<Axes: xlabel='date'>

[22]:

# Settings handed to normet.do_all through its model_config argument.
model_config={
    'time_budget': 60,  # Total running time in seconds
    'metric': 'r2', # Metric name; presumably the score the AutoML search optimises — TODO confirm against the normet docs
}

[23]:

# List the columns available in the prepared frame.
df1a.columns

[23]:

Index(['rowid', 'u10', 'd2m', 'rh2m', 'v10', 'ssrd', 't2m', 'blh', 'tcc', 'sp',
       'value', 'tp', 'date', 'date_unix', 'day_julian', 'weekday', 'hour',
       'set'],
      dtype='object')

[24]:

# One-call pipeline starting from the merged raw frame: returns the normalised
# series together with the model statistics.
df_dew, mod_stats = normet.do_all(
    df1,
    value='PM2.5',
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    variables_resample=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m'],
    model_config=model_config,
    n_samples=100,
)

2024-07-04 12:03:46 : Training AutoML...
2024-07-04 12:04:47 : Best model is xgboost with best model parameters of {'n_estimators': 80, 'max_leaves': 179, 'min_child_weight': 0.03469842703470486, 'learning_rate': 0.09050333780681966, 'subsample': 0.8788794424065157, 'colsample_bylevel': 0.7372635897769984, 'colsample_bytree': 0.9535003009271207, 'reg_alpha': 0.09353166342028453, 'reg_lambda': 3.6044671382295674}
2024-07-04 12:04:47 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:04:48 : Aggregating 100 predictions...

[25]:

# Same pipeline, but fed the already prepared frame and the already trained
# model (passed positionally as the second argument).
df_dew, mod_stats = normet.do_all(
    df1a,
    automl,
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    variables_resample=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m'],
    model_config=model_config,
    n_samples=100,
)

2024-07-04 12:04:48 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:04:49 : Aggregating 100 predictions...

[26]:

# Pipeline with the prepared frame and trained model, resampling the
# meteorology from the explicit weather_df pool.
df_dew, mod_stats = normet.do_all(
    df1a,
    automl,
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    variables_resample=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m'],
    weather_df=weather_df,
    model_config=model_config,
    n_samples=100,
)

2024-07-04 12:04:49 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:04:49 : Aggregating 100 predictions...

[27]:

# Overlay the three normalised series on one set of axes. Each line gets a
# label and the axes get a legend so the colours can be told apart; the
# trailing ';' suppresses the Axes/Legend repr in the cell output.
ax = df_dew1['normalised'].plot(c='r', label='df_dew1')
df_dew2['normalised'].plot(ax=ax, c='b', label='df_dew2')
df_dew['normalised'].plot(ax=ax, c='y', label='df_dew')
ax.set_ylabel('normalised')
ax.legend();

[27]:

<Axes: xlabel='date'>

[28]:

# Show the statistics returned by the most recent normet call that assigned mod_stats.
mod_stats

[28]:

	n	FAC2	MB	MGE	NMB	NMGE	RMSE	r	p_level	COE	IOA	R2	set
0	1593	0.912116	0.065145	1.948816	0.007081	0.211829	2.796755	0.936845	***	0.645061	0.822530	0.877679	testing
1	4780	0.971967	-0.013397	0.655479	-0.001470	0.071933	0.868739	0.994758	***	0.880270	0.940135	0.989543	training
2	6373	0.957006	0.006235	0.978762	0.000683	0.107153	1.587833	0.981282	***	0.821371	0.910685	0.962915	all

[29]:

# Variant of the pipeline that is asked for n_models=5 models.
df_dew, mod_stats = normet.do_all_unc(
    df1,
    value='PM2.5',
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    variables_resample=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m'],
    n_samples=100,
    n_models=5,
)

2024-07-04 12:05:52 : Progress: 20.00% (Model 1/5)... ETA: 4.14 minutes
2024-07-04 12:07:11 : Progress: 40.00% (Model 2/5)... ETA: 3.54 minutes
2024-07-04 12:08:14 : Progress: 60.00% (Model 3/5)... ETA: 2.27 minutes
2024-07-04 12:09:32 : Progress: 80.00% (Model 4/5)... ETA: 1.18 minutes
2024-07-04 12:10:32 : Progress: 100.00% (Model 5/5)... ETA: 0.00 seconds

[30]:

# Resampling from a given dataset: the meteorology pool is the
# 2020-01-01 .. 2020-04-01 slice of the merged frame.
df_dew, mod_stats = normet.do_all(
    df1,
    value='PM2.5',
    weather_df=df1.loc['2020-01-01':'2020-04-01', :],
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    variables_resample=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m'],
    model_config=model_config,
    n_samples=300,
)

2024-07-04 12:10:32 : Training AutoML...
2024-07-04 12:11:33 : Best model is xgboost with best model parameters of {'n_estimators': 80, 'max_leaves': 179, 'min_child_weight': 0.03469842703470486, 'learning_rate': 0.09050333780681966, 'subsample': 0.8788794424065157, 'colsample_bylevel': 0.7372635897769984, 'colsample_bytree': 0.9535003009271207, 'reg_alpha': 0.09353166342028453, 'reg_lambda': 3.6044671382295674}
2024-07-04 12:11:33 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:11:36 : Aggregating 300 predictions...

[31]:

# Preview the first five rows of the normalisation result.
df_dew.head(n=5)

[31]:

	observed	normalised
date
2020-01-01 00:00:00	58.1	22.036427
2020-01-01 01:00:00	43.2	20.604290
2020-01-01 02:00:00	43.0	20.541487
2020-01-01 03:00:00	42.8	20.982031
2020-01-01 04:00:00	36.8	20.209471

Time series decomposition

[32]:

# Decompose the PM2.5 series starting from the merged raw frame; the call
# returns the decomposition table and the model statistics.
df_dewca, mod_stats = normet.decom_emi(
    df1,
    value='PM2.5',
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    split_method='random',
    fraction=0.75,
    n_samples=300,
)

2024-07-04 12:11:36 : Training AutoML...
2024-07-04 12:12:36 : Best model is xgboost with best model parameters of {'n_estimators': 80, 'max_leaves': 179, 'min_child_weight': 0.03469842703470486, 'learning_rate': 0.09050333780681966, 'subsample': 0.8788794424065157, 'colsample_bylevel': 0.7372635897769984, 'colsample_bytree': 0.9535003009271207, 'reg_alpha': 0.09353166342028453, 'reg_lambda': 3.6044671382295674}
2024-07-04 12:12:36 : Subtracting base...
2024-07-04 12:12:39 : Subtracting date_unix... ETA: 11.40 seconds
2024-07-04 12:12:42 : Subtracting day_julian... ETA: 8.19 seconds
2024-07-04 12:12:45 : Subtracting weekday... ETA: 5.41 seconds
2024-07-04 12:12:47 : Subtracting hour... ETA: 2.68 seconds

[33]:

# Show the decom_emi result table.
df_dewca

[33]:

	observed	base	date_unix	day_julian	weekday	hour	deweathered	emi_noise
date
2020-01-01 00:00:00	58.1	9.818041	20.604118	1.449459	0.553579	-0.750149	22.521753	0.664746
2020-01-01 01:00:00	43.2	9.692105	21.145397	1.942423	0.545198	-0.448908	23.722921	0.538811
2020-01-01 02:00:00	43.0	8.911392	21.416718	1.732838	0.660833	-0.588243	22.980244	-0.241902
2020-01-01 03:00:00	42.8	8.811673	21.317305	1.873583	0.566668	-0.807030	22.608904	-0.341621
2020-01-01 04:00:00	36.8	8.689767	21.724403	1.793306	0.571714	-0.710558	22.915339	-0.463528
...	...	...	...	...	...	...	...	...
2020-12-31 19:00:00	11.7	8.585531	11.187433	1.018248	0.065087	0.030320	11.733325	-0.567763
2020-12-31 20:00:00	11.0	10.310694	9.855659	0.788847	0.049502	-0.077022	11.774386	1.157399
2020-12-31 21:00:00	15.3	8.988753	10.415834	1.015028	0.096924	-0.112138	11.251107	-0.164541
2020-12-31 22:00:00	17.1	9.330045	10.346726	0.955586	0.104383	-0.266882	11.316565	0.176750
2020-12-31 23:00:00	15.2	9.275574	10.362161	0.748316	0.091792	-0.295992	11.028556	0.122279

6373 rows × 8 columns

[34]:

# Same decomposition, reusing the prepared frame and the trained model.
df_dewca, mod_stats = normet.decom_emi(
    df1a,
    model=automl,
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    n_samples=300,
)

2024-07-04 12:12:50 : Subtracting base...
2024-07-04 12:12:52 : Subtracting date_unix... ETA: 10.73 seconds
2024-07-04 12:12:55 : Subtracting day_julian... ETA: 7.97 seconds
2024-07-04 12:12:58 : Subtracting weekday... ETA: 5.31 seconds
2024-07-04 12:13:00 : Subtracting hour... ETA: 2.64 seconds

[35]:

# Meteorological decomposition of PM2.5 starting from the merged raw frame,
# with a seeded split.
df_dewcb, mod_stats = normet.decom_met(
    df1,
    value='PM2.5',
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    n_samples=300,
    fraction=0.75,
    seed=7654321,
)

2024-07-04 12:13:03 : Training AutoML...
2024-07-04 12:14:03 : Best model is xgboost with best model parameters of {'n_estimators': 80, 'max_leaves': 179, 'min_child_weight': 0.03469842703470486, 'learning_rate': 0.09050333780681966, 'subsample': 0.8788794424065157, 'colsample_bylevel': 0.7372635897769984, 'colsample_bytree': 0.9535003009271207, 'reg_alpha': 0.09353166342028453, 'reg_lambda': 3.6044671382295674}
2024-07-04 12:14:04 : Subtracting deweathered...
2024-07-04 12:14:06 : Subtracting blh... ETA: 26.09 seconds
2024-07-04 12:14:09 : Subtracting u10... ETA: 23.09 seconds
2024-07-04 12:14:11 : Subtracting d2m... ETA: 20.13 seconds
2024-07-04 12:14:13 : Subtracting sp... ETA: 17.28 seconds
2024-07-04 12:14:16 : Subtracting v10... ETA: 14.58 seconds
2024-07-04 12:14:18 : Subtracting t2m... ETA: 11.93 seconds
2024-07-04 12:14:20 : Subtracting tp... ETA: 9.41 seconds
2024-07-04 12:14:22 : Subtracting tcc... ETA: 6.94 seconds
2024-07-04 12:14:24 : Subtracting ssrd... ETA: 4.55 seconds
2024-07-04 12:14:26 : Subtracting rh2m... ETA: 2.24 seconds

[36]:

# Meteorological decomposition reusing the prepared frame and the trained model.
df_dewcb, mod_stats = normet.decom_met(
    df1a,
    model=automl,
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    n_samples=300,
    fraction=0.75,
    seed=7654321,
)

2024-07-04 12:14:28 : Subtracting deweathered...
2024-07-04 12:14:30 : Subtracting blh... ETA: 26.35 seconds
2024-07-04 12:14:33 : Subtracting u10... ETA: 22.99 seconds
2024-07-04 12:14:35 : Subtracting d2m... ETA: 20.03 seconds
2024-07-04 12:14:38 : Subtracting sp... ETA: 17.18 seconds
2024-07-04 12:14:40 : Subtracting v10... ETA: 14.50 seconds
2024-07-04 12:14:42 : Subtracting t2m... ETA: 11.98 seconds
2024-07-04 12:14:44 : Subtracting tp... ETA: 9.42 seconds
2024-07-04 12:14:46 : Subtracting tcc... ETA: 6.97 seconds
2024-07-04 12:14:48 : Subtracting ssrd... ETA: 4.57 seconds
2024-07-04 12:14:50 : Subtracting rh2m... ETA: 2.25 seconds

[37]:

# Show the decom_met result table.
df_dewcb

[37]:

	observed	deweathered	blh	u10	d2m	sp	v10	t2m	tp	tcc	ssrd	rh2m	met_noise
date
2020-01-01 00:00:00	58.1	22.521753	3.105356	15.690435	13.370102	3.032124	6.203915	5.311699	2.730858	1.852100	0.520401	0.291279	9.400610
2020-01-01 01:00:00	43.2	23.722921	1.032499	10.332563	10.146141	3.130753	5.301819	4.798836	2.912220	2.052608	1.157616	-0.196972	-0.640710
2020-01-01 02:00:00	43.0	22.980244	0.215631	6.851231	8.113451	3.094269	4.130383	4.713776	3.276894	2.710617	2.732285	1.285332	1.364532
2020-01-01 03:00:00	42.8	22.608904	-0.558086	5.456263	8.559235	3.648994	3.184381	4.667519	3.397984	2.791050	3.348179	1.633690	1.993581
2020-01-01 04:00:00	36.8	22.915339	-0.296980	1.798401	4.787289	3.438236	1.801090	3.431309	2.871231	1.892057	2.969601	2.141823	1.182835
...	...	...	...	...	...	...	...	...	...	...	...	...	...
2020-12-31 19:00:00	11.7	11.733325	-0.629538	-0.715612	-0.015411	0.308806	0.392364	0.891096	0.799346	-0.003299	-0.062504	-0.241484	-0.272833
2020-12-31 20:00:00	11.0	11.774386	-0.537662	-0.538769	-0.091031	0.148932	0.305520	0.786525	0.761405	-0.405293	-0.437243	-0.193840	-0.571940
2020-12-31 21:00:00	15.3	11.251107	-0.243903	-0.258332	0.046582	0.338228	0.362024	0.831674	0.803146	-0.466408	-0.422866	-0.049072	3.652803
2020-12-31 22:00:00	17.1	11.316565	4.063190	2.834073	-1.142671	2.329886	2.193111	-0.082521	0.252815	0.015556	-0.009989	0.315748	0.370693
2020-12-31 23:00:00	15.2	11.028556	4.984342	2.931353	-2.154545	2.283448	2.286663	-0.245660	0.163794	0.237981	0.146729	-0.258326	-0.777352

6373 rows × 13 columns

[38]:

# Run decom_emi on the prepared frame with the trained model; this overwrites
# df_dewca and mod_stats from any earlier cell.
df_dewca, mod_stats = normet.decom_emi(
    df1a,
    model=automl,
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    n_samples=300,
)

2024-07-04 12:14:52 : Subtracting base...
2024-07-04 12:14:55 : Subtracting date_unix... ETA: 10.64 seconds
2024-07-04 12:14:57 : Subtracting day_julian... ETA: 8.06 seconds
2024-07-04 12:15:00 : Subtracting weekday... ETA: 5.31 seconds
2024-07-04 12:15:03 : Subtracting hour... ETA: 2.64 seconds

[39]:

# Show the decom_emi result table produced by the preceding call.
df_dewca

[39]:

	observed	base	date_unix	day_julian	weekday	hour	deweathered	emi_noise
date
2020-01-01 00:00:00	58.1	9.635087	20.808069	1.449459	0.553579	-0.750149	22.521753	0.460794
2020-01-01 01:00:00	43.2	9.427952	21.430550	1.942423	0.545198	-0.448908	23.722921	0.253659
2020-01-01 02:00:00	43.0	9.077608	21.271500	1.732838	0.660833	-0.588243	22.980244	-0.096684
2020-01-01 03:00:00	42.8	8.548481	21.601494	1.873583	0.566668	-0.807030	22.608904	-0.625812
2020-01-01 04:00:00	36.8	8.893300	21.541868	1.793306	0.571714	-0.710558	22.915339	-0.280993
...	...	...	...	...	...	...	...	...
2020-12-31 19:00:00	11.7	9.192358	10.601604	1.018248	0.065087	0.030320	11.733325	0.018065
2020-12-31 20:00:00	11.0	9.905566	10.281785	0.788847	0.049502	-0.077022	11.774386	0.731274
2020-12-31 21:00:00	15.3	8.519252	10.906334	1.015028	0.096924	-0.112138	11.251107	-0.655041
2020-12-31 22:00:00	17.1	8.960856	10.736913	0.955586	0.104383	-0.266882	11.316565	-0.213436
2020-12-31 23:00:00	15.2	9.216323	10.442410	0.748316	0.091792	-0.295992	11.028556	0.042030

6373 rows × 8 columns

Rolling weather normalisation

[40]:

# Rolling normalisation with the trained model: 14-day windows, with
# rollingevery=7.
df_dewc1, mod_stats = normet.rolling_dew(
    df1a,
    automl,
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    variables_resample=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m'],
    n_samples=100,
    window_days=14,
    rollingevery=7,
)

2024-07-04 12:15:06 : Rolling window 0 from 2020-01-01 to 2020-01-15
2024-07-04 12:15:08 : Rolling window 10 from 2020-03-13 to 2020-03-27 ETA: 7.84 seconds
2024-07-04 12:15:11 : Rolling window 20 from 2020-05-22 to 2020-06-05 ETA: 5.14 seconds
2024-07-04 12:15:14 : Rolling window 30 from 2020-08-02 to 2020-08-16 ETA: 2.45 seconds

[41]:

# Preview the first five rows of the rolling result.
df_dewc1.head(n=5)

[41]:

	observed	rolling_0	rolling_1	rolling_2	rolling_3	rolling_4	rolling_5	rolling_6	rolling_7	rolling_8	...	rolling_29	rolling_30	rolling_31	rolling_32	rolling_33	rolling_34	rolling_35	rolling_36	rolling_37	rolling_38
date
2020-01-01 00:00:00	58.1	16.412506	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-01 01:00:00	43.2	17.619698	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-01 02:00:00	43.0	16.589432	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-01 03:00:00	42.8	16.524757	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-01 04:00:00	36.8	17.865799	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 40 columns

[42]:

# Line-plot every column except the first one of the rolling result.
df_dewc1.iloc[:, 1:].plot.line()

[42]:

<Axes: xlabel='date'>

[43]:

# Rolling meteorological decomposition with the trained model, using the same
# 14-day window and rollingevery=7 settings.
df_dewc2, mod_stats = normet.rolling_met(
    df1a,
    automl,
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
                   'date_unix', 'day_julian', 'weekday', 'hour'],
    n_samples=100,
    window_days=14,
    rollingevery=7,
    fraction=0.75,
    seed=7654321,
)

2024-07-04 12:15:18 : Rolling window 0 from 2020-01-01 to 2020-01-15
2024-07-04 12:15:21 : Rolling window 10 from 2020-03-13 to 2020-03-27 ETA: 7.85 seconds
2024-07-04 12:15:23 : Rolling window 20 from 2020-05-22 to 2020-06-05 ETA: 5.16 seconds
2024-07-04 12:15:26 : Rolling window 30 from 2020-08-02 to 2020-08-16 ETA: 2.46 seconds

[44]:

# Preview the first five rows of the rolling_met result.
df_dewc2.head(n=5)

[44]:

	observed	normalised	emi_mean_14	emi_std_14	met_short	met_season
date
2020-01-01 00:00:00	58.1	22.331083	NaN	NaN	NaN	NaN
2020-01-01 01:00:00	43.2	23.972191	NaN	NaN	NaN	NaN
2020-01-01 02:00:00	43.0	22.342634	NaN	NaN	NaN	NaN
2020-01-01 03:00:00	42.8	22.705828	NaN	NaN	NaN	NaN
2020-01-01 04:00:00	36.8	22.325943	NaN	NaN	NaN	NaN

[45]:

# Plot the 'met_short' column of the rolling_met result against its index.
df_dewc2.loc[:, 'met_short'].plot()

[45]:

<Axes: xlabel='date'>

[46]:

# Plot the 'met_season' column of the rolling_met result against its index.
df_dewc2.loc[:, 'met_season'].plot()

[46]:

<Axes: xlabel='date'>

Partial Dependence Plots

[47]:

# Rebuild the prepared PM2.5 frame for the partial-dependence section, with
# the seeded random split.
df1a = normet.prepare_data(
    df1,
    value='PM2.5',
    feature_names=['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m'],
    split_method='random',
    fraction=0.75,
    seed=7654321,
)

[48]:

# Inspect the prepared frame that is passed to the partial-dependence call.
df1a

[48]:

	rowid	u10	d2m	rh2m	v10	ssrd	t2m	blh	tcc	sp	value	tp	date	date_unix	day_julian	weekday	hour	set
0	0	-2.720528	277.183465	91.884130	1.545010	-1.164153e-10	278.394725	384.209053	0.650958	102252.303312	58.1	0.000008	2020-01-01 00:00:00	1.577837e+09	1	3	0	training
1	1	-2.308789	276.695430	92.715877	1.282742	-1.164153e-10	277.772899	353.220263	0.603699	102211.168636	43.2	0.000002	2020-01-01 01:00:00	1.577840e+09	1	3	1	training
2	2	-2.216471	276.505662	93.485560	0.758730	-1.164153e-10	277.463419	255.911846	0.710378	102174.855967	43.0	0.000005	2020-01-01 02:00:00	1.577844e+09	1	3	2	testing
3	3	-1.928623	276.412816	93.906363	0.509013	-1.164153e-10	277.305813	191.375560	0.837765	102166.786485	42.8	0.000005	2020-01-01 03:00:00	1.577848e+09	1	3	3	training
4	4	-1.700043	276.553051	93.696878	0.607069	-1.164153e-10	277.478941	151.780210	0.819103	102142.578039	36.8	0.000003	2020-01-01 04:00:00	1.577851e+09	1	3	4	testing
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
6368	6368	1.380939	272.197565	90.582979	-1.945195	-5.820766e-11	273.557442	476.945688	0.918149	99902.506413	11.7	0.000000	2020-12-31 19:00:00	1.609441e+09	366	4	19	training
6369	6369	0.987925	272.171041	89.939908	-2.102732	-5.820766e-11	273.629146	486.665851	0.839639	99947.625909	11.0	0.000000	2020-12-31 20:00:00	1.609445e+09	366	4	20	training
6370	6370	0.681543	272.087408	90.422188	-1.933668	-5.820766e-11	273.470592	489.355002	0.739354	100000.215520	15.3	0.000000	2020-12-31 21:00:00	1.609448e+09	366	4	21	testing
6371	6371	1.020793	272.235319	95.088677	-0.583816	-5.820766e-11	272.926062	40.714872	0.643753	100042.844978	17.1	0.000000	2020-12-31 22:00:00	1.609452e+09	366	4	22	training
6372	6372	0.959517	272.020979	95.290673	-0.377511	-5.820766e-11	272.681367	55.617254	0.549403	100053.601944	15.2	0.000000	2020-12-31 23:00:00	1.609456e+09	366	4	23	training

6373 rows × 18 columns

[49]:

# Full predictor list: the meteorological variables first, then the time columns.
all_features = [
    'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    'date_unix', 'day_julian', 'weekday', 'hour',
]
# Compute partial-dependence values of the trained model for every predictor.
pdp_value = normet.pdp_all(
    automl,
    df1a,
    feature_names=all_features,
)

[50]:

# Show the partial-dependence table (columns: variable, value, pdp_mean, pdp_std).
pdp_value

[50]:

	variable	value	pdp_mean	pdp_std
0	u10	-4.354032	10.963994	7.261466
1	u10	-4.242993	11.043289	7.309947
2	u10	-4.131954	11.042447	7.333392
3	u10	-4.020915	11.031183	7.338155
4	u10	-3.909875	11.040386	7.363017
...	...	...	...	...
1226	hour	19.000000	9.639515	7.460504
1227	hour	20.000000	9.578510	7.448645
1228	hour	21.000000	9.388046	7.372877
1229	hour	22.000000	9.182877	7.346088
1230	hour	23.000000	8.964516	7.354586

1231 rows × 4 columns

[ ]: