[1]:
import pandas as pd
import numpy as np
import time
import normet
import matplotlib.pyplot as plt
import matplotlib
from pylab import savefig
[2]:
# Load the MY1 observations; the 'date' column is parsed as datetimes and used as the index.
df = pd.read_csv(
    r'data/MY1.csv',
    index_col='date',
    parse_dates=['date'],
)
[3]:
# Display the observations loaded from data/MY1.csv.
df
[3]:
O3 NO NO2 NOXasNO2 SO2 CO PM10 NV10 V10 PM2.5 ... AP10 AT2.5 AP2.5 site code latitude longitude location_type Ox NOx
date
2020-01-01 00:00:00 1.72961 78.38595 45.77784 165.96796 4.75424 NaN 69.0 60.0 9.0 58.1 ... 1026.2 4.7 1025.2 London Marylebone Road MY1 51.52253 -0.154611 Urban Traffic 23.960024 83.832703
2020-01-01 01:00:00 1.92918 88.61587 52.64325 188.51903 4.84394 0.397528 45.0 38.4 6.6 43.2 ... 1026.2 4.6 1024.1 London Marylebone Road MY1 51.52253 -0.154611 Urban Traffic 27.524407 95.223555
2020-01-01 02:00:00 1.99570 70.02935 44.76870 152.14554 3.09474 0.346417 46.2 39.1 7.1 43.0 ... 1026.2 4.9 1024.1 London Marylebone Road MY1 51.52253 -0.154611 Urban Traffic 23.579103 76.850791
2020-01-01 03:00:00 2.04559 67.58589 40.20699 143.83725 2.96019 0.335059 45.1 38.8 6.3 42.8 ... 1026.2 4.6 1024.1 London Marylebone Road MY1 51.52253 -0.154611 Urban Traffic 21.299094 72.654172
2020-01-01 04:00:00 2.99355 72.03298 47.26010 157.70912 3.83478 0.349257 40.8 34.2 6.6 36.8 ... 1026.2 4.2 1024.1 London Marylebone Road MY1 51.52253 -0.154611 Urban Traffic 25.320553 79.661030
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2020-12-31 19:00:00 20.70539 12.46950 30.47461 49.59424 1.12164 0.129904 13.9 11.8 2.1 11.7 ... 1002.9 -0.1 1001.9 London Marylebone Road MY1 51.52253 -0.154611 Urban Traffic 25.415693 25.050618
2020-12-31 20:00:00 24.14797 9.65279 26.51175 41.31249 1.12164 0.094180 14.6 11.3 3.3 11.0 ... 1002.9 0.6 1001.9 London Marylebone Road MY1 51.52253 -0.154611 Urban Traffic 25.080442 20.867401
2020-12-31 21:00:00 25.69464 12.46950 28.45232 47.57196 1.36199 0.087685 16.6 13.0 3.6 15.3 ... 1003.9 0.8 1001.9 London Marylebone Road MY1 51.52253 -0.154611 Urban Traffic 26.809326 24.029142
2020-12-31 22:00:00 26.39313 6.45629 25.05721 34.95672 0.88129 0.084437 19.1 16.0 3.1 17.1 ... 1003.9 0.3 1002.9 London Marylebone Road MY1 51.52253 -0.154611 Urban Traffic 25.432539 17.657011
2020-12-31 23:00:00 27.93980 6.61453 22.07004 32.21218 0.88129 0.087685 17.6 13.2 4.4 15.2 ... 1003.9 0.3 1002.9 London Marylebone Road MY1 51.52253 -0.154611 Urban Traffic 24.672379 16.270723

6373 rows × 53 columns

[4]:
# Load the companion ERA file for the same site, again indexed by the parsed 'date' column.
era = pd.read_csv(
    r'data/MY1_era.csv',
    index_col='date',
    parse_dates=['date'],
)
[5]:
# Display the variables loaded from data/MY1_era.csv.
era
[5]:
u10 v10 d2m t2m blh sp ssrd tcc tp rh2m lat lon
date
2020-01-01 00:00:00 -2.720528 1.545010 277.183465 278.394725 384.209053 102252.303312 -1.164153e-10 0.650958 0.000008 91.884130 51.52253 -0.154611
2020-01-01 01:00:00 -2.308789 1.282742 276.695430 277.772899 353.220263 102211.168636 -1.164153e-10 0.603699 0.000002 92.715877 51.52253 -0.154611
2020-01-01 02:00:00 -2.216471 0.758730 276.505662 277.463419 255.911846 102174.855967 -1.164153e-10 0.710378 0.000005 93.485560 51.52253 -0.154611
2020-01-01 03:00:00 -1.928623 0.509013 276.412816 277.305813 191.375560 102166.786485 -1.164153e-10 0.837765 0.000005 93.906363 51.52253 -0.154611
2020-01-01 04:00:00 -1.700043 0.607069 276.553051 277.478941 151.780210 102142.578039 -1.164153e-10 0.819103 0.000003 93.696878 51.52253 -0.154611
... ... ... ... ... ... ... ... ... ... ... ... ...
2020-12-31 19:00:00 1.380939 -1.945195 272.197565 273.557442 476.945688 99902.506413 -5.820766e-11 0.918149 0.000000 90.582979 51.52253 -0.154611
2020-12-31 20:00:00 0.987925 -2.102732 272.171041 273.629146 486.665851 99947.625909 -5.820766e-11 0.839639 0.000000 89.939908 51.52253 -0.154611
2020-12-31 21:00:00 0.681543 -1.933668 272.087408 273.470592 489.355002 100000.215520 -5.820766e-11 0.739354 0.000000 90.422188 51.52253 -0.154611
2020-12-31 22:00:00 1.020793 -0.583816 272.235319 272.926062 40.714872 100042.844978 -5.820766e-11 0.643753 0.000000 95.088677 51.52253 -0.154611
2020-12-31 23:00:00 0.959517 -0.377511 272.020979 272.681367 55.617254 100053.601944 -5.820766e-11 0.549403 0.000000 95.290673 51.52253 -0.154611

8784 rows × 12 columns

[6]:
# Put the observation and ERA columns side by side, aligned on the shared date index.
df1 = pd.concat([df, era], axis='columns')

AutoML-based weather normalisation

[7]:
# Build the modelling frame for PM2.5: the target becomes `value`, and rows are
# split at random into training (75 %) and testing sets with a fixed seed.
df1a = normet.prepare_data(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    split_method='random',
    fraction=0.75,
    seed=7654321,
)
[8]:
# Train the AutoML model on the prepared frame using the ten ERA variables plus
# the four time variables; the trailing semicolon suppresses the cell's output repr.
automl = normet.train_model(
    df1a,
    variables=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
);
2024-07-04 12:02:31 : Training AutoML...
2024-07-04 12:03:31 : Best model is xgboost with best model parameters of {'n_estimators': 80, 'max_leaves': 179, 'min_child_weight': 0.03469842703470486, 'learning_rate': 0.09050333780681966, 'subsample': 0.8788794424065157, 'colsample_bylevel': 0.7372635897769984, 'colsample_bytree': 0.9535003009271207, 'reg_alpha': 0.09353166342028453, 'reg_lambda': 3.6044671382295674}
[9]:
# Read the selected learner and its hyper-parameters off the AutoML object and report them.
best_model, best_config = automl.best_estimator, automl.best_config
print(f"Best model: {best_model}")
print(f"Best model parameters: {best_config}")

Best model: xgboost
Best model parameters: {'n_estimators': 80, 'max_leaves': 179, 'min_child_weight': 0.03469842703470486, 'learning_rate': 0.09050333780681966, 'subsample': 0.8788794424065157, 'colsample_bylevel': 0.7372635897769984, 'colsample_bytree': 0.9535003009271207, 'reg_alpha': 0.09353166342028453, 'reg_lambda': 3.6044671382295674}
[10]:
# Inspect the record of the best trial (validation loss, timings and configuration).
automl.best_result
[10]:
{'pred_time': 2.8282029858194133e-06,
 'wall_clock_time': 15.478726148605347,
 'metric_for_logging': {'pred_time': 2.8282029858194133e-06},
 'val_loss': 0.14506105728824534,
 'training_iteration': 1,
 'config': {'n_estimators': 80,
  'max_leaves': 179,
  'min_child_weight': 0.03469842703470486,
  'learning_rate': 0.09050333780681966,
  'subsample': 0.8788794424065157,
  'colsample_bylevel': 0.7372635897769984,
  'colsample_bytree': 0.9535003009271207,
  'reg_alpha': 0.09353166342028453,
  'reg_lambda': 3.6044671382295674},
 'config/n_estimators': 80,
 'config/max_leaves': 179,
 'config/min_child_weight': 0.03469842703470486,
 'config/learning_rate': 0.09050333780681966,
 'config/subsample': 0.8788794424065157,
 'config/colsample_bylevel': 0.7372635897769984,
 'config/colsample_bytree': 0.9535003009271207,
 'config/reg_alpha': 0.09353166342028453,
 'config/reg_lambda': 3.6044671382295674,
 'experiment_tag': 'exp',
 'time_total_s': 3.0076282024383545}
[11]:
# Feature importance scores of the fitted model.
# NOTE(review): presumably ordered like automl.feature_names_in_ - confirm before pairing them.
automl.feature_importances_
[11]:
array([0.0512452 , 0.09356024, 0.05020848, 0.07993569, 0.0396647 ,
       0.21618505, 0.05815253, 0.02513707, 0.02636059, 0.03911485,
       0.02288253, 0.1130693 , 0.15781933, 0.02666452], dtype=float32)
[12]:
# Feature names as reported by the fitted model; the order differs from the list
# passed to train_model ('weekday' comes first here).
automl.feature_names_in_
[12]:
array(['weekday', 'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc',
       'tp', 'rh2m', 'date_unix', 'day_julian', 'hour'], dtype='<U10')
[13]:
# Collect model performance statistics for the testing split, the training
# split and the whole dataset. Relabelling every row's `set` column to "all"
# lets the third call score the complete frame.
# ignore_index=True gives the three one-row results a unique 0..2 index
# instead of three rows that are all labelled 0.
mod_stats = pd.concat(
    [
        normet.modStats(df1a, automl, set='testing'),
        normet.modStats(df1a, automl, set='training'),
        normet.modStats(df1a.assign(set="all"), automl, set='all'),
    ],
    ignore_index=True,
)
[14]:
# Display the per-split model statistics table.
mod_stats
[14]:
n FAC2 MB MGE NMB NMGE RMSE r p_level COE IOA R2 set
0 1593 0.912116 0.065145 1.948816 0.007081 0.211829 2.796755 0.936845 *** 0.645061 0.822530 0.877679 testing
0 4780 0.971967 -0.013397 0.655479 -0.001470 0.071933 0.868739 0.994758 *** 0.880270 0.940135 0.989543 training
0 6373 0.957006 0.006235 0.978762 0.000683 0.107153 1.587833 0.981282 *** 0.821371 0.910685 0.962915 all
[15]:
# Display the prepared modelling frame (features, target `value`, time variables and the `set` label).
df1a
[15]:
rowid u10 d2m rh2m v10 ssrd t2m blh tcc sp value tp date date_unix day_julian weekday hour set
0 0 -2.720528 277.183465 91.884130 1.545010 -1.164153e-10 278.394725 384.209053 0.650958 102252.303312 58.1 0.000008 2020-01-01 00:00:00 1.577837e+09 1 3 0 training
1 1 -2.308789 276.695430 92.715877 1.282742 -1.164153e-10 277.772899 353.220263 0.603699 102211.168636 43.2 0.000002 2020-01-01 01:00:00 1.577840e+09 1 3 1 training
2 2 -2.216471 276.505662 93.485560 0.758730 -1.164153e-10 277.463419 255.911846 0.710378 102174.855967 43.0 0.000005 2020-01-01 02:00:00 1.577844e+09 1 3 2 testing
3 3 -1.928623 276.412816 93.906363 0.509013 -1.164153e-10 277.305813 191.375560 0.837765 102166.786485 42.8 0.000005 2020-01-01 03:00:00 1.577848e+09 1 3 3 training
4 4 -1.700043 276.553051 93.696878 0.607069 -1.164153e-10 277.478941 151.780210 0.819103 102142.578039 36.8 0.000003 2020-01-01 04:00:00 1.577851e+09 1 3 4 testing
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6368 6368 1.380939 272.197565 90.582979 -1.945195 -5.820766e-11 273.557442 476.945688 0.918149 99902.506413 11.7 0.000000 2020-12-31 19:00:00 1.609441e+09 366 4 19 training
6369 6369 0.987925 272.171041 89.939908 -2.102732 -5.820766e-11 273.629146 486.665851 0.839639 99947.625909 11.0 0.000000 2020-12-31 20:00:00 1.609445e+09 366 4 20 training
6370 6370 0.681543 272.087408 90.422188 -1.933668 -5.820766e-11 273.470592 489.355002 0.739354 100000.215520 15.3 0.000000 2020-12-31 21:00:00 1.609448e+09 366 4 21 testing
6371 6371 1.020793 272.235319 95.088677 -0.583816 -5.820766e-11 272.926062 40.714872 0.643753 100042.844978 17.1 0.000000 2020-12-31 22:00:00 1.609452e+09 366 4 22 training
6372 6372 0.959517 272.020979 95.290673 -0.377511 -5.820766e-11 272.681367 55.617254 0.549403 100053.601944 15.2 0.000000 2020-12-31 23:00:00 1.609456e+09 366 4 23 training

6373 rows × 18 columns

[16]:
# Time a weather-normalisation run on df1a with the trained model: the ten ERA
# variables are resampled and 1000 predictions are aggregated (aggregate=True).
# No weather_df is passed here.
# perf_counter() is used because it is the clock intended for measuring
# durations; time.time() follows the system wall clock and can jump.
start_time = time.perf_counter()
df_dew = normet.normalise(
    df1a,
    automl,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=1000,
    aggregate=True,
)
end_time = time.perf_counter()

# Compute the elapsed execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")
2024-07-04 12:03:31 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:03:41 : Aggregating 1000 predictions...
Execution time: 9.66 seconds
[17]:
# Time a normalisation run that passes an explicit weather_df: the rows of df1
# from 2020-01-01 through 2020-04-01. 300 predictions are aggregated.
# perf_counter() is used because it is the clock intended for measuring
# durations; time.time() follows the system wall clock and can jump.
start_time = time.perf_counter()
df_dew1 = normet.normalise(
    df1a,
    automl,
    weather_df=df1.loc['2020-01-01':'2020-04-01', :],
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=300,
    aggregate=True,
)
end_time = time.perf_counter()

# Compute the elapsed execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")
2024-07-04 12:03:41 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:03:44 : Aggregating 300 predictions...
Execution time: 2.67 seconds
[18]:
# Keep only the ten ERA columns of the first 100 rows of df1 and replace the
# date index with a plain 0..99 integer index.
weather_df = (
    df1[['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m']]
    .iloc[:100]
    .reset_index(drop=True)
)
[19]:
# Peek at the first rows of the 100-row weather frame.
weather_df.head()
[19]:
u10 v10 d2m t2m blh sp ssrd tcc tp rh2m
0 -2.720528 1.545010 277.183465 278.394725 384.209053 102252.303312 -1.164153e-10 0.650958 0.000008 91.884130
1 -2.308789 1.282742 276.695430 277.772899 353.220263 102211.168636 -1.164153e-10 0.603699 0.000002 92.715877
2 -2.216471 0.758730 276.505662 277.463419 255.911846 102174.855967 -1.164153e-10 0.710378 0.000005 93.485560
3 -1.928623 0.509013 276.412816 277.305813 191.375560 102166.786485 -1.164153e-10 0.837765 0.000005 93.906363
4 -1.700043 0.607069 276.553051 277.478941 151.780210 102142.578039 -1.164153e-10 0.819103 0.000003 93.696878
[20]:
# Time a normalisation run that passes the `weather_df` variable as the explicit
# weather frame. 300 predictions are aggregated.
# perf_counter() is used because it is the clock intended for measuring
# durations; time.time() follows the system wall clock and can jump.
start_time = time.perf_counter()
df_dew2 = normet.normalise(
    df1a,
    automl,
    weather_df=weather_df,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=300,
    aggregate=True,
)
end_time = time.perf_counter()

# Compute the elapsed execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")
2024-07-04 12:03:44 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:03:46 : Aggregating 300 predictions...
Execution time: 2.26 seconds
[21]:
# Overlay the two normalised series on one set of axes. The axes object is
# passed explicitly (rather than relying on the implicit current axes) and each
# line is labelled so the red and blue curves can be told apart in the legend.
ax = df_dew1['normalised'].plot(c='r', label='df_dew1 (weather_df: 2020-01-01 to 2020-04-01)')
df_dew2['normalised'].plot(c='b', ax=ax, label='df_dew2 (weather_df: first 100 rows)')
ax.legend()
ax
[21]:
<Axes: xlabel='date'>
../_images/notebooks_Case1_21_1.png
[22]:
# Settings handed to normet.do_all through its model_config argument.
model_config={
    'time_budget': 60,  # Total running time in seconds
    'metric': 'r2', # Metric name given to the AutoML search; presumably the score it optimises - confirm in normet docs
}
[23]:
# List the columns of the prepared frame.
df1a.columns
[23]:
Index(['rowid', 'u10', 'd2m', 'rh2m', 'v10', 'ssrd', 't2m', 'blh', 'tcc', 'sp',
       'value', 'tp', 'date', 'date_unix', 'day_julian', 'weekday', 'hour',
       'set'],
      dtype='object')
[24]:
# One-call workflow starting from the merged raw frame df1 with PM2.5 as target,
# using model_config for the AutoML settings and 100 resampled predictions.
df_dew, mod_stats = normet.do_all(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    model_config=model_config,
    n_samples=100,
)
2024-07-04 12:03:46 : Training AutoML...
2024-07-04 12:04:47 : Best model is xgboost with best model parameters of {'n_estimators': 80, 'max_leaves': 179, 'min_child_weight': 0.03469842703470486, 'learning_rate': 0.09050333780681966, 'subsample': 0.8788794424065157, 'colsample_bylevel': 0.7372635897769984, 'colsample_bytree': 0.9535003009271207, 'reg_alpha': 0.09353166342028453, 'reg_lambda': 3.6044671382295674}
2024-07-04 12:04:47 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:04:48 : Aggregating 100 predictions...
[25]:
# Same workflow, but fed the already prepared frame df1a and the already
# trained automl model as the first two positional arguments.
df_dew, mod_stats = normet.do_all(
    df1a,
    automl,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    model_config=model_config,
    n_samples=100,
)
2024-07-04 12:04:48 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:04:49 : Aggregating 100 predictions...
[26]:
# Prepared frame and trained model again, this time also passing the
# `weather_df` variable as the explicit weather frame.
df_dew, mod_stats = normet.do_all(
    df1a,
    automl,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    weather_df=weather_df,
    model_config=model_config,
    n_samples=100,
)
2024-07-04 12:04:49 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:04:49 : Aggregating 100 predictions...
[27]:
# Overlay the three normalised series on one set of axes. The axes object is
# passed explicitly (rather than relying on the implicit current axes) and each
# line is labelled so the curves can be told apart in the legend.
ax = df_dew1['normalised'].plot(c='r', label='df_dew1')
df_dew2['normalised'].plot(c='b', ax=ax, label='df_dew2')
df_dew['normalised'].plot(c='y', ax=ax, label='df_dew')
ax.legend()
ax
[27]:
<Axes: xlabel='date'>
../_images/notebooks_Case1_27_1.png
[28]:
# Display the statistics table currently bound to mod_stats.
mod_stats
[28]:
n FAC2 MB MGE NMB NMGE RMSE r p_level COE IOA R2 set
0 1593 0.912116 0.065145 1.948816 0.007081 0.211829 2.796755 0.936845 *** 0.645061 0.822530 0.877679 testing
1 4780 0.971967 -0.013397 0.655479 -0.001470 0.071933 0.868739 0.994758 *** 0.880270 0.940135 0.989543 training
2 6373 0.957006 0.006235 0.978762 0.000683 0.107153 1.587833 0.981282 *** 0.821371 0.910685 0.962915 all
[29]:
# Run the do_all_unc variant on the raw frame with n_models=5 and 100 samples.
df_dew, mod_stats = normet.do_all_unc(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=100,
    n_models=5,
)
2024-07-04 12:05:52 : Progress: 20.00% (Model 1/5)... ETA: 4.14 minutes
2024-07-04 12:07:11 : Progress: 40.00% (Model 2/5)... ETA: 3.54 minutes
2024-07-04 12:08:14 : Progress: 60.00% (Model 3/5)... ETA: 2.27 minutes
2024-07-04 12:09:32 : Progress: 80.00% (Model 4/5)... ETA: 1.18 minutes
2024-07-04 12:10:32 : Progress: 100.00% (Model 5/5)... ETA: 0.00 seconds
[30]:
# Resample from a given dataset: the rows of df1 from 2020-01-01 through
# 2020-04-01 are passed as weather_df.
df_dew, mod_stats = normet.do_all(
    df1,
    value='PM2.5',
    weather_df=df1.loc['2020-01-01':'2020-04-01', :],
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    model_config=model_config,
    n_samples=300,
)
2024-07-04 12:10:32 : Training AutoML...
2024-07-04 12:11:33 : Best model is xgboost with best model parameters of {'n_estimators': 80, 'max_leaves': 179, 'min_child_weight': 0.03469842703470486, 'learning_rate': 0.09050333780681966, 'subsample': 0.8788794424065157, 'colsample_bylevel': 0.7372635897769984, 'colsample_bytree': 0.9535003009271207, 'reg_alpha': 0.09353166342028453, 'reg_lambda': 3.6044671382295674}
2024-07-04 12:11:33 : Normalising the dataset using the trained model in parallel.
2024-07-04 12:11:36 : Aggregating 300 predictions...
[31]:
# First rows of the result: observed and normalised values per timestamp.
df_dew.head()
[31]:
observed normalised
date
2020-01-01 00:00:00 58.1 22.036427
2020-01-01 01:00:00 43.2 20.604290
2020-01-01 02:00:00 43.0 20.541487
2020-01-01 03:00:00 42.8 20.982031
2020-01-01 04:00:00 36.8 20.209471

Time series decomposition

[32]:
# Decomposition via decom_emi, starting from the raw frame df1 (PM2.5 target,
# random 75 % training split, 300 samples).
df_dewca, mod_stats = normet.decom_emi(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    split_method='random',
    fraction=0.75,
    n_samples=300,
)
2024-07-04 12:11:36 : Training AutoML...
2024-07-04 12:12:36 : Best model is xgboost with best model parameters of {'n_estimators': 80, 'max_leaves': 179, 'min_child_weight': 0.03469842703470486, 'learning_rate': 0.09050333780681966, 'subsample': 0.8788794424065157, 'colsample_bylevel': 0.7372635897769984, 'colsample_bytree': 0.9535003009271207, 'reg_alpha': 0.09353166342028453, 'reg_lambda': 3.6044671382295674}
2024-07-04 12:12:36 : Subtracting base...
2024-07-04 12:12:39 : Subtracting date_unix... ETA: 11.40 seconds
2024-07-04 12:12:42 : Subtracting day_julian... ETA: 8.19 seconds
2024-07-04 12:12:45 : Subtracting weekday... ETA: 5.41 seconds
2024-07-04 12:12:47 : Subtracting hour... ETA: 2.68 seconds
[33]:
# Display the decom_emi result table.
df_dewca
[33]:
observed base date_unix day_julian weekday hour deweathered emi_noise
date
2020-01-01 00:00:00 58.1 9.818041 20.604118 1.449459 0.553579 -0.750149 22.521753 0.664746
2020-01-01 01:00:00 43.2 9.692105 21.145397 1.942423 0.545198 -0.448908 23.722921 0.538811
2020-01-01 02:00:00 43.0 8.911392 21.416718 1.732838 0.660833 -0.588243 22.980244 -0.241902
2020-01-01 03:00:00 42.8 8.811673 21.317305 1.873583 0.566668 -0.807030 22.608904 -0.341621
2020-01-01 04:00:00 36.8 8.689767 21.724403 1.793306 0.571714 -0.710558 22.915339 -0.463528
... ... ... ... ... ... ... ... ...
2020-12-31 19:00:00 11.7 8.585531 11.187433 1.018248 0.065087 0.030320 11.733325 -0.567763
2020-12-31 20:00:00 11.0 10.310694 9.855659 0.788847 0.049502 -0.077022 11.774386 1.157399
2020-12-31 21:00:00 15.3 8.988753 10.415834 1.015028 0.096924 -0.112138 11.251107 -0.164541
2020-12-31 22:00:00 17.1 9.330045 10.346726 0.955586 0.104383 -0.266882 11.316565 0.176750
2020-12-31 23:00:00 15.2 9.275574 10.362161 0.748316 0.091792 -0.295992 11.028556 0.122279

6373 rows × 8 columns

[34]:
# decom_emi again, reusing the prepared frame df1a and the trained automl model.
df_dewca, mod_stats = normet.decom_emi(
    df1a,
    model=automl,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=300,
)
2024-07-04 12:12:50 : Subtracting base...
2024-07-04 12:12:52 : Subtracting date_unix... ETA: 10.73 seconds
2024-07-04 12:12:55 : Subtracting day_julian... ETA: 7.97 seconds
2024-07-04 12:12:58 : Subtracting weekday... ETA: 5.31 seconds
2024-07-04 12:13:00 : Subtracting hour... ETA: 2.64 seconds
[35]:
# Decomposition via decom_met, starting from the raw frame df1 (PM2.5 target,
# 75 % training fraction, fixed seed, 300 samples).
df_dewcb, mod_stats = normet.decom_met(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=300,
    fraction=0.75,
    seed=7654321,
)
2024-07-04 12:13:03 : Training AutoML...
2024-07-04 12:14:03 : Best model is xgboost with best model parameters of {'n_estimators': 80, 'max_leaves': 179, 'min_child_weight': 0.03469842703470486, 'learning_rate': 0.09050333780681966, 'subsample': 0.8788794424065157, 'colsample_bylevel': 0.7372635897769984, 'colsample_bytree': 0.9535003009271207, 'reg_alpha': 0.09353166342028453, 'reg_lambda': 3.6044671382295674}
2024-07-04 12:14:04 : Subtracting deweathered...
2024-07-04 12:14:06 : Subtracting blh... ETA: 26.09 seconds
2024-07-04 12:14:09 : Subtracting u10... ETA: 23.09 seconds
2024-07-04 12:14:11 : Subtracting d2m... ETA: 20.13 seconds
2024-07-04 12:14:13 : Subtracting sp... ETA: 17.28 seconds
2024-07-04 12:14:16 : Subtracting v10... ETA: 14.58 seconds
2024-07-04 12:14:18 : Subtracting t2m... ETA: 11.93 seconds
2024-07-04 12:14:20 : Subtracting tp... ETA: 9.41 seconds
2024-07-04 12:14:22 : Subtracting tcc... ETA: 6.94 seconds
2024-07-04 12:14:24 : Subtracting ssrd... ETA: 4.55 seconds
2024-07-04 12:14:26 : Subtracting rh2m... ETA: 2.24 seconds
[36]:
# decom_met again, reusing the prepared frame df1a and the trained automl model.
df_dewcb, mod_stats = normet.decom_met(
    df1a,
    model=automl,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=300,
    fraction=0.75,
    seed=7654321,
)
2024-07-04 12:14:28 : Subtracting deweathered...
2024-07-04 12:14:30 : Subtracting blh... ETA: 26.35 seconds
2024-07-04 12:14:33 : Subtracting u10... ETA: 22.99 seconds
2024-07-04 12:14:35 : Subtracting d2m... ETA: 20.03 seconds
2024-07-04 12:14:38 : Subtracting sp... ETA: 17.18 seconds
2024-07-04 12:14:40 : Subtracting v10... ETA: 14.50 seconds
2024-07-04 12:14:42 : Subtracting t2m... ETA: 11.98 seconds
2024-07-04 12:14:44 : Subtracting tp... ETA: 9.42 seconds
2024-07-04 12:14:46 : Subtracting tcc... ETA: 6.97 seconds
2024-07-04 12:14:48 : Subtracting ssrd... ETA: 4.57 seconds
2024-07-04 12:14:50 : Subtracting rh2m... ETA: 2.25 seconds
[37]:
# Display the decom_met result table.
df_dewcb
[37]:
observed deweathered blh u10 d2m sp v10 t2m tp tcc ssrd rh2m met_noise
date
2020-01-01 00:00:00 58.1 22.521753 3.105356 15.690435 13.370102 3.032124 6.203915 5.311699 2.730858 1.852100 0.520401 0.291279 9.400610
2020-01-01 01:00:00 43.2 23.722921 1.032499 10.332563 10.146141 3.130753 5.301819 4.798836 2.912220 2.052608 1.157616 -0.196972 -0.640710
2020-01-01 02:00:00 43.0 22.980244 0.215631 6.851231 8.113451 3.094269 4.130383 4.713776 3.276894 2.710617 2.732285 1.285332 1.364532
2020-01-01 03:00:00 42.8 22.608904 -0.558086 5.456263 8.559235 3.648994 3.184381 4.667519 3.397984 2.791050 3.348179 1.633690 1.993581
2020-01-01 04:00:00 36.8 22.915339 -0.296980 1.798401 4.787289 3.438236 1.801090 3.431309 2.871231 1.892057 2.969601 2.141823 1.182835
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2020-12-31 19:00:00 11.7 11.733325 -0.629538 -0.715612 -0.015411 0.308806 0.392364 0.891096 0.799346 -0.003299 -0.062504 -0.241484 -0.272833
2020-12-31 20:00:00 11.0 11.774386 -0.537662 -0.538769 -0.091031 0.148932 0.305520 0.786525 0.761405 -0.405293 -0.437243 -0.193840 -0.571940
2020-12-31 21:00:00 15.3 11.251107 -0.243903 -0.258332 0.046582 0.338228 0.362024 0.831674 0.803146 -0.466408 -0.422866 -0.049072 3.652803
2020-12-31 22:00:00 17.1 11.316565 4.063190 2.834073 -1.142671 2.329886 2.193111 -0.082521 0.252815 0.015556 -0.009989 0.315748 0.370693
2020-12-31 23:00:00 15.2 11.028556 4.984342 2.931353 -2.154545 2.283448 2.286663 -0.245660 0.163794 0.237981 0.146729 -0.258326 -0.777352

6373 rows × 13 columns

[38]:
# Repeat the decom_emi call on the prepared frame df1a with the trained automl model.
df_dewca, mod_stats = normet.decom_emi(
    df1a,
    model=automl,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=300,
)
2024-07-04 12:14:52 : Subtracting base...
2024-07-04 12:14:55 : Subtracting date_unix... ETA: 10.64 seconds
2024-07-04 12:14:57 : Subtracting day_julian... ETA: 8.06 seconds
2024-07-04 12:15:00 : Subtracting weekday... ETA: 5.31 seconds
2024-07-04 12:15:03 : Subtracting hour... ETA: 2.64 seconds
[39]:
# Display the result of the repeated decom_emi run.
df_dewca
[39]:
observed base date_unix day_julian weekday hour deweathered emi_noise
date
2020-01-01 00:00:00 58.1 9.635087 20.808069 1.449459 0.553579 -0.750149 22.521753 0.460794
2020-01-01 01:00:00 43.2 9.427952 21.430550 1.942423 0.545198 -0.448908 23.722921 0.253659
2020-01-01 02:00:00 43.0 9.077608 21.271500 1.732838 0.660833 -0.588243 22.980244 -0.096684
2020-01-01 03:00:00 42.8 8.548481 21.601494 1.873583 0.566668 -0.807030 22.608904 -0.625812
2020-01-01 04:00:00 36.8 8.893300 21.541868 1.793306 0.571714 -0.710558 22.915339 -0.280993
... ... ... ... ... ... ... ... ...
2020-12-31 19:00:00 11.7 9.192358 10.601604 1.018248 0.065087 0.030320 11.733325 0.018065
2020-12-31 20:00:00 11.0 9.905566 10.281785 0.788847 0.049502 -0.077022 11.774386 0.731274
2020-12-31 21:00:00 15.3 8.519252 10.906334 1.015028 0.096924 -0.112138 11.251107 -0.655041
2020-12-31 22:00:00 17.1 8.960856 10.736913 0.955586 0.104383 -0.266882 11.316565 -0.213436
2020-12-31 23:00:00 15.2 9.216323 10.442410 0.748316 0.091792 -0.295992 11.028556 0.042030

6373 rows × 8 columns

Rolling weather normalisation

[40]:
# Rolling normalisation with the trained model: window_days=14, rollingevery=7,
# 100 samples per run.
df_dewc1, mod_stats = normet.rolling_dew(
    df1a,
    automl,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=100,
    window_days=14,
    rollingevery=7,
)
2024-07-04 12:15:06 : Rolling window 0 from 2020-01-01 to 2020-01-15
2024-07-04 12:15:08 : Rolling window 10 from 2020-03-13 to 2020-03-27 ETA: 7.84 seconds
2024-07-04 12:15:11 : Rolling window 20 from 2020-05-22 to 2020-06-05 ETA: 5.14 seconds
2024-07-04 12:15:14 : Rolling window 30 from 2020-08-02 to 2020-08-16 ETA: 2.45 seconds
[41]:
# First rows of the rolling result: `observed` plus one rolling_<k> column per window.
df_dewc1.head()
[41]:
observed rolling_0 rolling_1 rolling_2 rolling_3 rolling_4 rolling_5 rolling_6 rolling_7 rolling_8 ... rolling_29 rolling_30 rolling_31 rolling_32 rolling_33 rolling_34 rolling_35 rolling_36 rolling_37 rolling_38
date
2020-01-01 00:00:00 58.1 16.412506 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2020-01-01 01:00:00 43.2 17.619698 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2020-01-01 02:00:00 43.0 16.589432 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2020-01-01 03:00:00 42.8 16.524757 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2020-01-01 04:00:00 36.8 17.865799 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 40 columns

[42]:
# Plot every column except the first one (`observed`), i.e. the rolling_* series.
df_dewc1.iloc[:,1:].plot()
[42]:
<Axes: xlabel='date'>
../_images/notebooks_Case1_44_1.png
[43]:
# rolling_met with the trained model: window_days=14, rollingevery=7,
# 100 samples, 75 % fraction and a fixed seed.
df_dewc2, mod_stats = normet.rolling_met(
    df1a,
    automl,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=100,
    window_days=14,
    rollingevery=7,
    fraction=0.75,
    seed=7654321,
)
2024-07-04 12:15:18 : Rolling window 0 from 2020-01-01 to 2020-01-15
2024-07-04 12:15:21 : Rolling window 10 from 2020-03-13 to 2020-03-27 ETA: 7.85 seconds
2024-07-04 12:15:23 : Rolling window 20 from 2020-05-22 to 2020-06-05 ETA: 5.16 seconds
2024-07-04 12:15:26 : Rolling window 30 from 2020-08-02 to 2020-08-16 ETA: 2.46 seconds
[44]:
# First rows of the rolling_met result.
df_dewc2.head()
[44]:
observed normalised emi_mean_14 emi_std_14 met_short met_season
date
2020-01-01 00:00:00 58.1 22.331083 NaN NaN NaN NaN
2020-01-01 01:00:00 43.2 23.972191 NaN NaN NaN NaN
2020-01-01 02:00:00 43.0 22.342634 NaN NaN NaN NaN
2020-01-01 03:00:00 42.8 22.705828 NaN NaN NaN NaN
2020-01-01 04:00:00 36.8 22.325943 NaN NaN NaN NaN
[45]:
# Plot the met_short column of the rolling_met result.
df_dewc2['met_short'].plot()
[45]:
<Axes: xlabel='date'>
../_images/notebooks_Case1_47_1.png
[46]:
# Plot the met_season column of the rolling_met result.
df_dewc2['met_season'].plot()
[46]:
<Axes: xlabel='date'>
../_images/notebooks_Case1_48_1.png

Partial Dependence Plots

[47]:
# Rebuild the prepared PM2.5 frame with the same random 75 % split and seed as
# earlier in the notebook.
df1a = normet.prepare_data(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    split_method='random',
    fraction=0.75,
    seed=7654321,
)
[48]:
# Display the rebuilt prepared frame.
df1a
[48]:
rowid u10 d2m rh2m v10 ssrd t2m blh tcc sp value tp date date_unix day_julian weekday hour set
0 0 -2.720528 277.183465 91.884130 1.545010 -1.164153e-10 278.394725 384.209053 0.650958 102252.303312 58.1 0.000008 2020-01-01 00:00:00 1.577837e+09 1 3 0 training
1 1 -2.308789 276.695430 92.715877 1.282742 -1.164153e-10 277.772899 353.220263 0.603699 102211.168636 43.2 0.000002 2020-01-01 01:00:00 1.577840e+09 1 3 1 training
2 2 -2.216471 276.505662 93.485560 0.758730 -1.164153e-10 277.463419 255.911846 0.710378 102174.855967 43.0 0.000005 2020-01-01 02:00:00 1.577844e+09 1 3 2 testing
3 3 -1.928623 276.412816 93.906363 0.509013 -1.164153e-10 277.305813 191.375560 0.837765 102166.786485 42.8 0.000005 2020-01-01 03:00:00 1.577848e+09 1 3 3 training
4 4 -1.700043 276.553051 93.696878 0.607069 -1.164153e-10 277.478941 151.780210 0.819103 102142.578039 36.8 0.000003 2020-01-01 04:00:00 1.577851e+09 1 3 4 testing
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6368 6368 1.380939 272.197565 90.582979 -1.945195 -5.820766e-11 273.557442 476.945688 0.918149 99902.506413 11.7 0.000000 2020-12-31 19:00:00 1.609441e+09 366 4 19 training
6369 6369 0.987925 272.171041 89.939908 -2.102732 -5.820766e-11 273.629146 486.665851 0.839639 99947.625909 11.0 0.000000 2020-12-31 20:00:00 1.609445e+09 366 4 20 training
6370 6370 0.681543 272.087408 90.422188 -1.933668 -5.820766e-11 273.470592 489.355002 0.739354 100000.215520 15.3 0.000000 2020-12-31 21:00:00 1.609448e+09 366 4 21 testing
6371 6371 1.020793 272.235319 95.088677 -0.583816 -5.820766e-11 272.926062 40.714872 0.643753 100042.844978 17.1 0.000000 2020-12-31 22:00:00 1.609452e+09 366 4 22 training
6372 6372 0.959517 272.020979 95.290673 -0.377511 -5.820766e-11 272.681367 55.617254 0.549403 100053.601944 15.2 0.000000 2020-12-31 23:00:00 1.609456e+09 366 4 23 training

6373 rows × 18 columns

[49]:
# Full predictor list: the ten ERA variables followed by the four time variables.
all_features = (
    ['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m']
    + ['date_unix', 'day_julian', 'weekday', 'hour']
)
# Compute partial dependence values for every listed feature with the trained model.
pdp_value = normet.pdp_all(automl, df1a, feature_names=all_features)
[50]:
# Display the partial dependence table (variable, value, pdp_mean, pdp_std).
pdp_value
[50]:
variable value pdp_mean pdp_std
0 u10 -4.354032 10.963994 7.261466
1 u10 -4.242993 11.043289 7.309947
2 u10 -4.131954 11.042447 7.333392
3 u10 -4.020915 11.031183 7.338155
4 u10 -3.909875 11.040386 7.363017
... ... ... ... ...
1226 hour 19.000000 9.639515 7.460504
1227 hour 20.000000 9.578510 7.448645
1228 hour 21.000000 9.388046 7.372877
1229 hour 22.000000 9.182877 7.346088
1230 hour 23.000000 8.964516 7.354586

1231 rows × 4 columns

[ ]: