[1]:
import pandas as pd
import numpy as np
import time
import normet as nm
import matplotlib.pyplot as plt
import matplotlib
from pylab import savefig
[2]:
# Load the MY1 dataset; the 'date' column is parsed as datetimes and becomes
# the index of the frame.
df1 = pd.read_csv(
    r'data/MY1_data.csv',
    parse_dates=['date'],
    index_col='date',
)
[3]:
# Display the loaded frame (the rendering below is truncated to the first and
# last rows by pandas).
df1
[3]:
O3 NO NO2 NOXasNO2 SO2 CO PM10 NV10 V10 PM2.5 ... d2m t2m blh sp ssrd tcc tp rh2m lat lon
date
2020-01-01 00:00:00 1.72961 78.38595 45.77784 165.96796 4.75424 NaN 69.0 60.0 9.0 58.1 ... 277.183465 278.394725 384.209053 102252.303312 -1.164153e-10 0.650958 0.000008 91.884130 51.52253 -0.154611
2020-01-01 01:00:00 1.92918 88.61587 52.64325 188.51903 4.84394 0.397528 45.0 38.4 6.6 43.2 ... 276.695430 277.772899 353.220263 102211.168636 -1.164153e-10 0.603699 0.000002 92.715877 51.52253 -0.154611
2020-01-01 02:00:00 1.99570 70.02935 44.76870 152.14554 3.09474 0.346417 46.2 39.1 7.1 43.0 ... 276.505662 277.463419 255.911846 102174.855967 -1.164153e-10 0.710378 0.000005 93.485560 51.52253 -0.154611
2020-01-01 03:00:00 2.04559 67.58589 40.20699 143.83725 2.96019 0.335059 45.1 38.8 6.3 42.8 ... 276.412816 277.305813 191.375560 102166.786485 -1.164153e-10 0.837765 0.000005 93.906363 51.52253 -0.154611
2020-01-01 04:00:00 2.99355 72.03298 47.26010 157.70912 3.83478 0.349257 40.8 34.2 6.6 36.8 ... 276.553051 277.478941 151.780210 102142.578039 -1.164153e-10 0.819103 0.000003 93.696878 51.52253 -0.154611
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2020-12-31 19:00:00 20.70539 12.46950 30.47461 49.59424 1.12164 0.129904 13.9 11.8 2.1 11.7 ... 272.197565 273.557442 476.945688 99902.506413 -5.820766e-11 0.918149 0.000000 90.582979 51.52253 -0.154611
2020-12-31 20:00:00 24.14797 9.65279 26.51175 41.31249 1.12164 0.094180 14.6 11.3 3.3 11.0 ... 272.171041 273.629146 486.665851 99947.625909 -5.820766e-11 0.839639 0.000000 89.939908 51.52253 -0.154611
2020-12-31 21:00:00 25.69464 12.46950 28.45232 47.57196 1.36199 0.087685 16.6 13.0 3.6 15.3 ... 272.087408 273.470592 489.355002 100000.215520 -5.820766e-11 0.739354 0.000000 90.422188 51.52253 -0.154611
2020-12-31 22:00:00 26.39313 6.45629 25.05721 34.95672 0.88129 0.084437 19.1 16.0 3.1 17.1 ... 272.235319 272.926062 40.714872 100042.844978 -5.820766e-11 0.643753 0.000000 95.088677 51.52253 -0.154611
2020-12-31 23:00:00 27.93980 6.61453 22.07004 32.21218 0.88129 0.087685 17.6 13.2 4.4 15.2 ... 272.020979 272.681367 55.617254 100053.601944 -5.820766e-11 0.549403 0.000000 95.290673 51.52253 -0.154611

8784 rows × 65 columns

AutoML-based weather normalisation

[4]:
# Build the modelling table for PM2.5 from the ten meteorological columns,
# using a random 75 % / 25 % split with a fixed seed.
# The resulting frame (see df1a.columns further down) stores the target as
# 'value', adds date_unix / day_julian / weekday / hour and a 'set' label.
df1a = nm.prepare_data(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    split_method='random',
    fraction=0.75,
    seed=7654321,
)
[5]:
# AutoML settings handed to nm.train_model for the FLAML run.
model_config = dict(
    # Total running time in seconds
    time_budget=90,
    # Primary metric for regression, 'mae', 'mse', 'r2', 'mape',...
    metric='r2',
    # List of ML learners: "lgbm", "rf", "xgboost", "extra_tree", "xgb_limitdepth"
    estimator_list=["lgbm"],
)
[6]:
# Train with the default AutoML package ('flaml' per the train_model signature)
# on the meteorological columns plus the four time features.
# The trailing ';' suppresses the cell's return-value display.
model1 = nm.train_model(
    df1a,
    variables=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    model_config=model_config,
);
2024-09-24 16:20:23 : Training AutoML...
2024-09-24 16:21:57 : Best model is lgbm with best model parameters of {'n_estimators': 4779, 'num_leaves': 37, 'min_child_samples': 13, 'learning_rate': 0.05353325544814332, 'log_max_bin': 10, 'colsample_bytree': 0.7006661480744041, 'reg_alpha': 3.4963871783049667, 'reg_lambda': 0.12148015741620988}
[7]:
# Show the repr of the model object returned by nm.train_model.
model1
[7]:
AutoML(append_log=False, auto_augment=True, custom_hp={},
       cv_score_agg_func=None, early_stop=False, ensemble=False,
       estimator_list='auto', eval_method='auto', fit_kwargs_by_estimator={},
       force_cancel=False, free_mem_ratio=0, hpo_method='auto',
       keep_search_state=False, learner_selector='sample', log_file_name='',
       log_training_metric=False, log_type='better', max_iter=None,
       mem_thres=4294967296, metric='auto', metric_constraints=[],
       min_sample_size=10000, mlflow_logging=True, model_history=False,
       n_concurrent_trials=1, n_jobs=-1, n_splits=5, pred_time_limit=inf,
       preserve_checkpoint=True, retrain_full=True, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
[8]:
# Report which learner AutoML selected and the hyper-parameters it settled on,
# then display the full result record of the best trial.
best_model = model1.best_estimator
best_config = model1.best_config
print(f"Best model: {best_model}")
print(f"Best model parameters: {best_config}")
model1.best_result
Best model: lgbm
Best model parameters: {'n_estimators': 4779, 'num_leaves': 37, 'min_child_samples': 13, 'learning_rate': 0.05353325544814332, 'log_max_bin': 10, 'colsample_bytree': 0.7006661480744041, 'reg_alpha': 3.4963871783049667, 'reg_lambda': 0.12148015741620988}
[8]:
{'pred_time': 1.9110346438994467e-05,
 'wall_clock_time': 90.33755874633789,
 'metric_for_logging': {'pred_time': 1.9110346438994467e-05},
 'val_loss': 0.12917426813136582,
 'training_iteration': 1,
 'config': {'n_estimators': 4779,
  'num_leaves': 37,
  'min_child_samples': 13,
  'learning_rate': 0.05353325544814332,
  'log_max_bin': 10,
  'colsample_bytree': 0.7006661480744041,
  'reg_alpha': 3.4963871783049667,
  'reg_lambda': 0.12148015741620988},
 'config/n_estimators': 4779,
 'config/num_leaves': 37,
 'config/min_child_samples': 13,
 'config/learning_rate': 0.05353325544814332,
 'config/log_max_bin': 10,
 'config/colsample_bytree': 0.7006661480744041,
 'config/reg_alpha': 3.4963871783049667,
 'config/reg_lambda': 0.12148015741620988,
 'experiment_tag': 'exp',
 'time_total_s': 9.342859029769897}
[9]:
# Feature importances of the fitted model; presumably ordered like
# model1.feature_names_in_ — confirm before pairing them up.
model1.feature_importances_
[9]:
array([ 993, 6549, 6934, 5345, 5295, 6633, 6315, 4237, 6126, 3377, 6176,
       4652, 1062, 3878], dtype=int32)
[10]:
# Names of the input features recorded on the fitted model.
model1.feature_names_in_
[10]:
['weekday',
 'u10',
 'v10',
 'd2m',
 't2m',
 'blh',
 'sp',
 'ssrd',
 'tcc',
 'tp',
 'rh2m',
 'date_unix',
 'day_julian',
 'hour']
[11]:
# AutoML settings for the H2O run. NOTE: this rebinds `model_config`, replacing
# the FLAML settings defined earlier in the notebook.
model_config = dict(
    time_budget=90,
    # max_models=10,          # Maximum number of models to train
    # max_mem_size='12g',     # Maximum memory size for H2O
    # List of algorithms to use in AutoML
    estimator_list=['GBM'],
)
[12]:
# Same training call as for model1, but with the H2O AutoML back end.
# The trailing ';' suppresses the cell's return-value display.
model2 = nm.train_model(
    df1a,
    automl_pkg='h2o',
    variables=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    model_config=model_config,
);
H2O is not running. Starting H2O...
Checking whether there is an H2O instance running at http://localhost:54321. connected.
H2O_cluster_uptime: 3 hours 59 mins
H2O_cluster_timezone: Europe/London
H2O_data_parsing_timezone: UTC
H2O_cluster_version: 3.46.0.5
H2O_cluster_version_age: 26 days
H2O_cluster_name: H2O_from_python_n94921cs_5qrqdn
H2O_cluster_total_nodes: 1
H2O_cluster_free_memory: 7.572 Gb
H2O_cluster_total_cores: 8
H2O_cluster_allowed_cores: 1
H2O_cluster_status: locked, healthy
H2O_connection_url: http://localhost:54321
H2O_connection_proxy: {"http": null, "https": null}
H2O_internal_security: False
Python_version: 3.12.2 final
2024-09-24 16:21:58: Training AutoML...
2024-09-24 16:22:42: Best model obtained! - GBM_grid_1_AutoML_4_20240924_162158_model_2
[13]:
# Show the H2O model summary (metrics, scoring history, variable importances).
model2
[13]:
Model Details
=============
H2OGradientBoostingEstimator : Gradient Boosting Machine
Model Key: GBM_grid_1_AutoML_4_20240924_162158_model_2
Model Summary:
number_of_trees number_of_internal_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves
69.0 69.0 1189276.0 13.0 13.0 13.0 625.0 1942.0 1369.2754
ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 0.011242541344013361
RMSE: 0.10603085090676846
MAE: 0.07893400538333294
RMSLE: NaN
Mean Residual Deviance: 0.011242541344013361
ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 8.126360199772725
RMSE: 2.850677147586644
MAE: 1.9222905950632483
RMSLE: NaN
Mean Residual Deviance: 8.126360199772725
Cross-Validation Metrics Summary:
mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
aic nan 0.0 nan nan nan nan nan
loglikelihood nan 0.0 nan nan nan nan nan
mae 1.9222906 0.0324555 1.9084675 1.9052794 1.888859 1.9710108 1.9378362
mean_residual_deviance 8.12636 0.6548642 8.182448 7.1749625 7.8459816 8.850865 8.577542
mse 8.12636 0.6548642 8.182448 7.1749625 7.8459816 8.850865 8.577542
r2 0.8793102 0.0095628 0.8805432 0.8902119 0.8867002 0.8692322 0.8698634
residual_deviance 8.12636 0.6548642 8.182448 7.1749625 7.8459816 8.850865 8.577542
rmse 2.8487926 0.1158737 2.860498 2.678612 2.801068 2.9750404 2.928744
rmsle nan 0.0 nan nan nan nan nan
Scoring History:
timestamp duration number_of_trees training_rmse training_mae training_deviance
2024-09-24 16:22:28 7.661 sec 0.0 8.2062346 5.4746640 67.3422868
2024-09-24 16:22:28 7.723 sec 5.0 5.0442433 3.4151840 25.4443903
2024-09-24 16:22:28 7.779 sec 10.0 3.1914116 2.2053144 10.1851080
2024-09-24 16:22:28 7.835 sec 15.0 2.0162629 1.4358716 4.0653160
2024-09-24 16:22:28 7.892 sec 20.0 1.3258726 0.9765391 1.7579382
2024-09-24 16:22:28 7.957 sec 25.0 0.8801164 0.6675258 0.7746049
2024-09-24 16:22:28 8.014 sec 30.0 0.6213492 0.4796348 0.3860749
2024-09-24 16:22:28 8.067 sec 35.0 0.4578002 0.3572500 0.2095810
2024-09-24 16:22:28 8.132 sec 40.0 0.3379802 0.2647451 0.1142306
2024-09-24 16:22:28 8.191 sec 45.0 0.2648459 0.2051404 0.0701433
2024-09-24 16:22:28 8.251 sec 50.0 0.2069501 0.1595394 0.0428284
2024-09-24 16:22:28 8.305 sec 55.0 0.1663724 0.1275311 0.0276798
2024-09-24 16:22:28 8.354 sec 60.0 0.1397592 0.1064105 0.0195326
2024-09-24 16:22:28 8.408 sec 65.0 0.1167263 0.0878304 0.0136250
2024-09-24 16:22:29 8.495 sec 69.0 0.1060309 0.0789340 0.0112425
Variable Importances:
variable relative_importance scaled_importance percentage
u10 251463.8437500 1.0 0.1484518
blh 210509.4843750 0.8371362 0.1242744
d2m 202005.6875000 0.8033190 0.1192542
day_julian 201374.0937500 0.8008073 0.1188813
sp 158984.7187500 0.6322369 0.0938567
date_unix 130323.0937500 0.5182578 0.0769363
t2m 108348.5078125 0.4308711 0.0639636
v10 87987.5078125 0.3499012 0.0519435
weekday 81752.5156250 0.3251064 0.0482626
rh2m 80490.9296875 0.3200895 0.0475179
hour 61422.0507812 0.2442580 0.0362605
tcc 57315.1562500 0.2279260 0.0338360
ssrd 46235.0937500 0.1838638 0.0272949
tp 15695.9404297 0.0624183 0.0092661

[tips]
Use `model.explain()` to inspect the model.
--
Use `h2o.display.toggle_user_tips()` to switch on/off this section.
[14]:
# IPython help syntax: print the signature and docstring of nm.train_model.
?nm.train_model
Signature:
nm.train_model(
    df,
    value='value',
    automl_pkg='flaml',
    variables=None,
    model_config=None,
    seed=7654321,
    n_cores=None,
    verbose=True,
)
Docstring:
Trains a machine learning model using either FLAML or H2O AutoML.

Parameters:
    df (pandas.DataFrame): Input dataset to train the model.
    value (str): The name of the target column in the dataset. Default is "value".
    automl_pkg (str): The AutoML package to use ("flaml" or "h2o").
    variables (list, optional): List of feature variables to use for training.
    model_config (dict, optional): Configuration settings for the model training.
    seed (int, optional): Random seed for reproducibility. Default is 7654321.
    n_cores (int, optional): Number of CPU cores to use for training. Default is None.
    verbose (bool, optional): Whether to print detailed logs. Default is True.

Returns:
    model: Trained machine learning model with a custom attribute `_model_type` indicating the package used.
File:      ~/anaconda3/envs/normet/lib/python3.12/site-packages/normet/normet.py
Type:      function
[15]:
# Performance statistics of model1 on df1a (reported per 'set' in the table below).
mod_stats1 = nm.modStats(df1a, model1)
[16]:
# Display the statistics table for model1 (training / testing / all rows).
mod_stats1
[16]:
n FAC2 MB MGE NMB NMGE RMSE r p_level COE IOA R2 set
0 4780 0.987238 -0.000007 0.268328 -7.437997e-07 0.029447 0.351283 0.999110 *** 0.950987 0.975494 0.998222 training
1 1593 0.907721 0.090223 1.920277 9.806910e-03 0.208727 2.699133 0.940890 *** 0.650259 0.825129 0.885274 testing
2 6373 0.967362 0.022547 0.681250 2.468418e-03 0.074582 1.383329 0.985535 *** 0.875668 0.937834 0.971279 all
[17]:
# Performance statistics of model2 on df1a (reported per 'set' in the table below).
mod_stats2 = nm.modStats(df1a, model2)
[18]:
# Display the statistics table for model2 (training / testing / all rows).
mod_stats2
[18]:
n FAC2 MB MGE NMB NMGE RMSE r p_level COE IOA R2 set
0 4780 0.994142 2.145110e-08 0.078934 2.354070e-09 0.008662 0.106031 0.999923 *** 0.985582 0.992791 0.999846 training
1 1593 0.901444 1.458128e-01 1.952102 1.584932e-02 0.212186 2.753047 0.939557 *** 0.644462 0.822231 0.882767 testing
2 6373 0.970971 3.644749e-02 0.547153 3.990205e-03 0.059901 1.379475 0.985645 *** 0.900142 0.950071 0.971496 all
[19]:
# Weather-normalise df1a with model1 and time the call.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring elapsed durations, so the
# reported time cannot be distorted by system clock adjustments.
start_time = time.perf_counter()
df_dew1 = nm.normalise(
    df1a,
    model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    # Only the ten meteorological columns are listed for resampling;
    # the four time features are not.
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=1000,
    aggregate=True,
)
end_time = time.perf_counter()

# Compute the execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")
2024-09-24 16:22:45: Normalising the dataset in parallel.
2024-09-24 16:22:47: Predicting using trained model in batches.
2024-09-24 16:25:07: Aggregating 1000 predictions...
Execution time: 142.41 seconds
[20]:
# Weather-normalise df1a with model2 (H2O) and time the call.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring elapsed durations, so the
# reported time cannot be distorted by system clock adjustments.
start_time = time.perf_counter()
df_dew2 = nm.normalise(
    df1a,
    model2,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    # Only the ten meteorological columns are listed for resampling;
    # the four time features are not.
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=1000,
    aggregate=True,
)
end_time = time.perf_counter()

# Compute the execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")
2024-09-24 16:25:08: Normalising the dataset in parallel.
2024-09-24 16:25:09: Predicting using trained model in batches.
2024-09-24 16:29:04: Aggregating 1000 predictions...
Execution time: 236.45 seconds
[21]:
# Compare the weather-normalised series produced with the two models.
# Both lines are drawn on the explicitly created axes and labelled so the
# figure can be read on its own; the trailing ';' hides the repr output.
fig, ax = plt.subplots()
df_dew1['normalised'].plot(ax=ax, label='model1 (FLAML)')
df_dew2['normalised'].plot(ax=ax, label='model2 (H2O)')
ax.set_ylabel('normalised PM2.5')
ax.set_title('Weather-normalised PM2.5: model1 vs model2')
ax.legend();
[21]:
<Axes: xlabel='date'>
../_images/notebooks_Case1_21_1.png
[22]:
# Weather-normalise df1a with model1 without aggregating, and time the call.
# With aggregate=False the result keeps a 'seed' column next to
# date / observed / normalised (see df_dew3.head() in the next cell).
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring elapsed durations.
start_time = time.perf_counter()
df_dew3 = nm.normalise(
    df1a,
    model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=1000,
    aggregate=False,
)
end_time = time.perf_counter()

# Compute the execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")
2024-09-24 16:29:05: Normalising the dataset in parallel.
2024-09-24 16:29:06: Predicting using trained model in batches.
Execution time: 143.14 seconds
[23]:
# Preview the non-aggregated result: one 'normalised' value per row, tagged with a seed.
df_dew3.head()
[23]:
date observed normalised seed
0 2020-01-01 00:00:00 58.1 17.221536 979812
1 2020-01-01 01:00:00 43.2 32.937436 979812
2 2020-01-01 02:00:00 43.0 24.423807 979812
3 2020-01-01 03:00:00 42.8 23.372347 979812
4 2020-01-01 04:00:00 36.8 26.478917 979812
[24]:
# Take the meteorological columns of the first 100 rows of df1 as a separate
# weather table; reset_index() replaces the date index with a plain 0..99 index.
weather_df = df1.reset_index()[[
    'u10', 'v10', 'd2m', 't2m', 'blh',
    'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
]].head(100)
[25]:
# Preview the weather table that is passed as weather_df further down.
weather_df.head()
[25]:
u10 v10 d2m t2m blh sp ssrd tcc tp rh2m
0 -2.720528 1.545010 277.183465 278.394725 384.209053 102252.303312 -1.164153e-10 0.650958 0.000008 91.884130
1 -2.308789 1.282742 276.695430 277.772899 353.220263 102211.168636 -1.164153e-10 0.603699 0.000002 92.715877
2 -2.216471 0.758730 276.505662 277.463419 255.911846 102174.855967 -1.164153e-10 0.710378 0.000005 93.485560
3 -1.928623 0.509013 276.412816 277.305813 191.375560 102166.786485 -1.164153e-10 0.837765 0.000005 93.906363
4 -1.700043 0.607069 276.553051 277.478941 151.780210 102142.578039 -1.164153e-10 0.819103 0.000003 93.696878
[26]:
# Weather-normalise df1a with model1, passing the separate weather_df table,
# and time the call.
# NOTE: this rebinds df_dew2, which an earlier cell assigned the model2 result;
# the plot in the following cell compares df_dew1 with THIS value.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring elapsed durations.
start_time = time.perf_counter()
df_dew2 = nm.normalise(
    df1a,
    model1,
    weather_df=weather_df,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=300,
    aggregate=True,
)
end_time = time.perf_counter()

# Compute the execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")
2024-09-24 16:31:28: Normalising the dataset in parallel.
2024-09-24 16:31:28: Predicting using trained model in batches.
2024-09-24 16:32:10: Aggregating 300 predictions...
Execution time: 42.37 seconds
[27]:
# Compare the two model1 normalisations: without (red) and with (blue) the
# explicit weather_df table. Lines are labelled and drawn on explicit axes;
# the trailing ';' hides the repr output.
fig, ax = plt.subplots()
df_dew1['normalised'].plot(ax=ax, c='r', label='model1, no weather_df')
df_dew2['normalised'].plot(ax=ax, c='b', label='model1, weather_df given')
ax.set_ylabel('normalised PM2.5')
ax.set_title('Weather-normalised PM2.5: effect of weather_df')
ax.legend();
[27]:
<Axes: xlabel='date'>
../_images/notebooks_Case1_27_1.png
[28]:
# AutoML settings used by the nm.do_all calls below. NOTE: rebinds
# `model_config` again (previously holding the H2O settings).
model_config = dict(
    time_budget=60,  # Total running time in seconds
    metric='r2',
)
[29]:
# List the columns of the prepared modelling table.
df1a.columns
[29]:
Index(['rowid', 'd2m', 'blh', 'ssrd', 't2m', 'v10', 'u10', 'sp', 'tp', 'tcc',
       'rh2m', 'date', 'value', 'date_unix', 'day_julian', 'weekday', 'hour',
       'set'],
      dtype='object')
[30]:
# One-call pipeline starting from the raw frame: the log below shows a model
# being trained and then the normalisation being run.
df_dew, mod_stats = nm.do_all(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    model_config=model_config,
    n_samples=100,
)
2024-09-24 16:32:10 : Training AutoML...
2024-09-24 16:33:08 : Best model is lgbm with best model parameters of {'n_estimators': 527, 'num_leaves': 57, 'min_child_samples': 19, 'learning_rate': 0.10029209493914669, 'log_max_bin': 10, 'colsample_bytree': 0.777659907533841, 'reg_alpha': 5.054157418960246, 'reg_lambda': 0.023089272254781048}
2024-09-24 16:33:09: Normalising the dataset in parallel.
2024-09-24 16:33:09: Predicting using trained model in batches.
2024-09-24 16:33:14: Aggregating 100 predictions...
[31]:
# Same pipeline, but on the prepared table with the already-trained model1
# passed in; the log below shows no training step for this call.
df_dew, mod_stats = nm.do_all(
    df1a,
    model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    model_config=model_config,
    n_samples=100,
)
2024-09-24 16:33:15: Normalising the dataset in parallel.
2024-09-24 16:33:15: Predicting using trained model in batches.
2024-09-24 16:33:28: Aggregating 100 predictions...
[32]:
# Resampling from a given dataset: as the previous call, with the separate
# weather_df table supplied as well.
df_dew, mod_stats = nm.do_all(
    df1a,
    model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    weather_df=weather_df,
    model_config=model_config,
    n_samples=100,
)
2024-09-24 16:33:29: Normalising the dataset in parallel.
2024-09-24 16:33:29: Predicting using trained model in batches.
2024-09-24 16:33:42: Aggregating 100 predictions...
[33]:
# Repeat the pipeline for five models (n_models=5). The result shown below has
# one normalised_<seed> column per model plus mean / std / median /
# lower_bound / upper_bound / weighted summary columns.
df_dew, mod_stats = nm.do_all_unc(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=100,
    n_models=5,
)
2024-09-24 16:35:15 : Progress: 20.00% (Model 1/5)... ETA: 6.22 minutes
2024-09-24 16:37:17 : Progress: 40.00% (Model 2/5)... ETA: 5.37 minutes
2024-09-24 16:39:04 : Progress: 60.00% (Model 3/5)... ETA: 3.58 minutes
2024-09-24 16:41:10 : Progress: 80.00% (Model 4/5)... ETA: 1.87 minutes
2024-09-24 16:42:49 : Progress: 100.00% (Model 5/5)... ETA: 0.00 seconds
[34]:
# Preview the multi-model result (per-seed columns and summary statistics).
df_dew.head()
[34]:
observed normalised_979812 normalised_378829 normalised_120727 normalised_541475 normalised_488292 mean std median lower_bound upper_bound weighted
date
2020-01-01 00:00:00 58.1 30.252205 26.007282 21.472511 13.609010 13.521217 20.972445 6.655142 21.472511 13.529996 29.827713 20.220786
2020-01-01 01:00:00 43.2 30.718119 22.379913 22.043467 11.979618 14.756184 20.375460 6.570192 22.043467 12.257275 29.884298 18.638353
2020-01-01 02:00:00 43.0 30.120009 21.627895 20.220487 13.131916 14.394936 19.899048 6.060521 20.220487 13.258218 29.270797 18.202305
2020-01-01 03:00:00 42.8 28.591281 21.836566 22.887696 11.813100 12.544112 19.534551 6.435126 21.836566 11.886201 28.020922 18.432641
2020-01-01 04:00:00 36.8 30.390922 21.896232 21.653471 12.113235 12.944652 19.799702 6.724275 21.653471 12.196376 29.541453 18.257228

Time series decomposition

[35]:
# Decomposition via nm.decom_emi starting from the raw frame (a model is
# trained first, per the log below). The result has base, one column per
# time feature, deweathered and emi_noise.
df_dewca, mod_stats = nm.decom_emi(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    split_method='random',
    fraction=0.75,
    n_samples=300,
)
2024-09-24 16:42:49 : Training AutoML...
2024-09-24 16:44:15 : Best model is lgbm with best model parameters of {'n_estimators': 4779, 'num_leaves': 37, 'min_child_samples': 13, 'learning_rate': 0.05353325544814332, 'log_max_bin': 10, 'colsample_bytree': 0.7006661480744041, 'reg_alpha': 3.4963871783049667, 'reg_lambda': 0.12148015741620988}
2024-09-24 16:44:15 : Subtracting base...
2024-09-24 16:44:59 : Subtracting date_unix... ETA: 2.93 minutes
2024-09-24 16:45:41 : Subtracting day_julian... ETA: 2.16 minutes
2024-09-24 16:46:27 : Subtracting weekday... ETA: 1.46 minutes
2024-09-24 16:47:12 : Subtracting hour... ETA: 44.25 seconds
[36]:
# Display the decom_emi result (rendering truncated to first/last rows).
df_dewca
[36]:
observed base date_unix day_julian weekday hour deweathered emi_noise
date
2020-01-01 00:00:00 58.1 9.834232 18.762883 3.617389 0.991752 0.271323 24.320077 0.676729
2020-01-01 01:00:00 43.2 9.211363 19.549169 3.381027 0.795226 0.420953 24.200235 0.053860
2020-01-01 02:00:00 43.0 8.740657 18.760258 4.192777 0.800782 -0.187559 23.149413 -0.416846
2020-01-01 03:00:00 42.8 8.869214 18.804496 3.959970 0.529601 -0.208110 22.797669 -0.288289
2020-01-01 04:00:00 36.8 8.184215 19.394484 3.038679 0.761000 0.107604 22.328479 -0.973288
... ... ... ... ... ... ... ... ...
2020-12-31 19:00:00 11.7 8.621910 12.759990 0.336607 -0.046839 0.406496 12.920661 -0.535593
2020-12-31 20:00:00 11.0 8.749583 12.430059 0.528524 -0.321991 0.086078 12.314750 -0.407920
2020-12-31 21:00:00 15.3 8.957951 12.361814 0.078852 -0.219580 0.304223 12.325758 -0.199552
2020-12-31 22:00:00 17.1 11.150086 10.425434 0.642186 -0.419770 0.013659 12.654092 1.992583
2020-12-31 23:00:00 15.2 8.579548 12.403958 0.512501 0.115038 -0.705544 11.747998 -0.577955

6373 rows × 8 columns

[37]:
# Same decomposition on the prepared table, reusing the trained model1
# (no training step appears in the log below).
df_dewca, mod_stats = nm.decom_emi(
    df1a,
    model=model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=300,
)
2024-09-24 16:47:57 : Subtracting base...
2024-09-24 16:48:43 : Subtracting date_unix... ETA: 3.06 minutes
2024-09-24 16:49:30 : Subtracting day_julian... ETA: 2.31 minutes
2024-09-24 16:50:13 : Subtracting weekday... ETA: 1.51 minutes
2024-09-24 16:50:54 : Subtracting hour... ETA: 44.25 seconds
[38]:
# Decomposition via nm.decom_met starting from the raw frame (a model is
# trained first, per the log below). The result has deweathered, one column
# per meteorological variable and met_noise.
df_dewcb, mod_stats = nm.decom_met(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=300,
    fraction=0.75,
    seed=7654321,
)
2024-09-24 16:51:38 : Training AutoML...
2024-09-24 16:53:13 : Best model is lgbm with best model parameters of {'n_estimators': 4779, 'num_leaves': 37, 'min_child_samples': 13, 'learning_rate': 0.05353325544814332, 'log_max_bin': 10, 'colsample_bytree': 0.7006661480744041, 'reg_alpha': 3.4963871783049667, 'reg_lambda': 0.12148015741620988}
2024-09-24 16:53:14 : Subtracting deweathered...
2024-09-24 16:53:59 : Subtracting v10... ETA: 7.63 minutes
2024-09-24 16:54:42 : Subtracting blh... ETA: 6.60 minutes
2024-09-24 16:55:24 : Subtracting u10... ETA: 5.79 minutes
2024-09-24 16:56:05 : Subtracting sp... ETA: 4.98 minutes
2024-09-24 16:56:45 : Subtracting rh2m... ETA: 4.22 minutes
2024-09-24 16:57:25 : Subtracting tcc... ETA: 3.49 minutes
2024-09-24 16:58:05 : Subtracting d2m... ETA: 2.78 minutes
2024-09-24 16:58:44 : Subtracting t2m... ETA: 2.07 minutes
2024-09-24 16:59:24 : Subtracting ssrd... ETA: 1.37 minutes
2024-09-24 17:00:06 : Subtracting tp... ETA: 41.25 seconds
[39]:
# Same decomposition on the prepared table, reusing the trained model1
# (no training step appears in the log below).
df_dewcb, mod_stats = nm.decom_met(
    df1a,
    model=model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=300,
    fraction=0.75,
    seed=7654321,
)
2024-09-24 17:00:49 : Subtracting deweathered...
2024-09-24 17:01:32 : Subtracting v10... ETA: 7.12 minutes
2024-09-24 17:02:15 : Subtracting blh... ETA: 6.50 minutes
2024-09-24 17:03:01 : Subtracting u10... ETA: 5.86 minutes
2024-09-24 17:03:45 : Subtracting sp... ETA: 5.13 minutes
2024-09-24 17:04:29 : Subtracting rh2m... ETA: 4.40 minutes
2024-09-24 17:05:11 : Subtracting tcc... ETA: 3.64 minutes
2024-09-24 17:05:53 : Subtracting d2m... ETA: 2.89 minutes
2024-09-24 17:06:38 : Subtracting t2m... ETA: 2.18 minutes
2024-09-24 17:07:17 : Subtracting ssrd... ETA: 1.44 minutes
2024-09-24 17:07:57 : Subtracting tp... ETA: 42.86 seconds
[40]:
# Display the decom_met result (rendering truncated to first/last rows).
df_dewcb
[40]:
observed deweathered v10 blh u10 sp rh2m tcc d2m t2m ssrd tp met_noise
date
2020-01-01 00:00:00 58.1 23.775707 2.803204 2.690817 10.533843 14.459222 6.452000 5.053862 4.216987 5.003190 4.792167 5.521003 1.596199
2020-01-01 01:00:00 43.2 23.712904 1.546627 4.394804 11.799731 10.566854 1.769647 1.297532 1.966014 2.295684 1.542771 1.862642 -0.930420
2020-01-01 02:00:00 43.0 23.308907 1.740158 6.840758 12.729884 8.339736 1.449420 1.877752 2.321789 0.919309 -0.113854 2.106147 -0.392609
2020-01-01 03:00:00 42.8 21.808085 2.402217 10.777716 12.743709 5.307592 1.729576 2.229663 2.012881 0.785298 0.305861 1.625649 0.265997
2020-01-01 04:00:00 36.8 22.173301 1.698096 8.612109 9.510499 3.318336 1.694841 2.583169 2.313238 0.499432 -0.465211 1.463167 -1.849514
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2020-12-31 19:00:00 11.7 12.869070 0.146833 -0.498743 -0.443972 0.866222 0.683517 -0.338941 -0.219422 -0.602156 -1.466656 -0.560928 -0.034524
2020-12-31 20:00:00 11.0 12.018398 0.320049 -0.056764 -0.113305 0.966535 0.720601 -0.261063 -0.324155 -1.152303 -1.581597 -0.370613 -0.144190
2020-12-31 21:00:00 15.3 12.231738 0.397497 -0.242102 -0.911311 0.589118 0.786209 -0.246796 -0.040521 -0.713239 -1.326826 -0.321144 4.002425
2020-12-31 22:00:00 17.1 12.514066 0.676143 4.688038 1.570879 -1.779531 1.186440 0.857997 1.715646 0.982372 -0.903092 -0.228862 0.065919
2020-12-31 23:00:00 15.2 11.770696 0.539106 5.628566 2.067334 -2.426622 0.414002 0.017931 1.326883 0.882671 -0.988648 -0.456333 -0.216909

6373 rows × 13 columns

[41]:
# Repeats the earlier decom_emi call with model1 and identical arguments;
# the values displayed afterwards differ slightly from the earlier run.
df_dewca, mod_stats = nm.decom_emi(
    df1a,
    model=model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=300,
)
2024-09-24 17:08:35 : Subtracting base...
2024-09-24 17:09:22 : Subtracting date_unix... ETA: 3.10 minutes
2024-09-24 17:10:09 : Subtracting day_julian... ETA: 2.36 minutes
2024-09-24 17:10:55 : Subtracting weekday... ETA: 1.56 minutes
2024-09-24 17:11:40 : Subtracting hour... ETA: 46.26 seconds
[42]:
# Display the result of the repeated decom_emi run.
df_dewca
[42]:
observed base date_unix day_julian weekday hour deweathered emi_noise
date
2020-01-01 00:00:00 58.1 9.636890 18.471507 3.981692 1.373361 -0.159863 24.146201 0.479505
2020-01-01 01:00:00 43.2 9.560233 18.945920 3.962606 0.950994 -0.031199 24.231168 0.402847
2020-01-01 02:00:00 43.0 9.457613 18.446638 3.478760 1.149818 -0.735301 22.640142 0.300227
2020-01-01 03:00:00 42.8 9.553435 17.988441 4.450814 0.201102 -0.431935 22.604472 0.396050
2020-01-01 04:00:00 36.8 8.259178 18.887683 3.579915 1.171948 -0.676739 22.064599 -0.898208
... ... ... ... ... ... ... ... ...
2020-12-31 19:00:00 11.7 8.791405 12.619534 0.570769 -0.282182 0.277319 12.819460 -0.365980
2020-12-31 20:00:00 11.0 8.768305 12.125637 0.483223 0.000221 -0.067327 12.152672 -0.389080
2020-12-31 21:00:00 15.3 8.492577 12.543496 0.251146 0.147919 -0.081575 12.196178 -0.664809
2020-12-31 22:00:00 17.1 9.704561 12.125288 0.186412 -0.348574 0.029269 12.539570 0.547175
2020-12-31 23:00:00 15.2 8.273833 12.567619 0.757832 -0.068558 -0.986782 11.386559 -0.883552

6373 rows × 8 columns

Rolling weather normalisation

[43]:
# Rolling normalisation with model1: window_days=14 and rolling_every=7.
# The result holds 'observed' plus one rolling_<k> column per window.
df_dewc1, mod_stats = nm.rolling(
    df1a,
    model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=100,
    window_days=14,
    rolling_every=7,
)
2024-09-24 17:12:28: Rolling window 0 from 2020-01-01 to 2020-01-14
2024-09-24 17:12:37: Rolling window 10 from 2020-03-03 to 2020-03-16
2024-09-24 17:12:47: Rolling window 20 from 2020-05-02 to 2020-05-15
2024-09-24 17:12:55: Rolling window 30 from 2020-07-01 to 2020-07-14
2024-09-24 17:13:02: Rolling window 40 from 2020-09-17 to 2020-09-30
[44]:
# Preview the rolling result; each rolling_<k> column is NaN outside its window.
df_dewc1.head()
[44]:
observed rolling_0 rolling_1 rolling_2 rolling_3 rolling_4 rolling_5 rolling_6 rolling_7 rolling_8 ... rolling_35 rolling_36 rolling_37 rolling_38 rolling_39 rolling_40 rolling_41 rolling_42 rolling_43 rolling_44
date
2020-01-01 00:00:00 58.1 20.207316 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2020-01-01 01:00:00 43.2 20.131948 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2020-01-01 02:00:00 43.0 19.247751 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2020-01-01 03:00:00 42.8 19.310005 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2020-01-01 04:00:00 36.8 20.072632 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 46 columns

[45]:
# Plot every rolling_<k> series (all columns except the first, 'observed')
# on explicit axes with an axis label and title; the trailing ';' hides the
# repr output.
fig, ax = plt.subplots()
df_dewc1.iloc[:, 1:].plot(ax=ax)
ax.set_ylabel('normalised PM2.5')
ax.set_title('Rolling weather normalisation');
[45]:
<Axes: xlabel='date'>
../_images/notebooks_Case1_47_1.png

Partial Dependence Plots

[46]:
# Rebuild the modelling table with the same arguments as at the top of the
# notebook (random 75 % / 25 % split, fixed seed).
df1a = nm.prepare_data(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    split_method='random',
    fraction=0.75,
    seed=7654321,
)
[47]:
# Display the prepared table (rendering truncated to first/last rows).
df1a
[47]:
rowid d2m blh ssrd t2m v10 u10 sp tp tcc rh2m date value date_unix day_julian weekday hour set
0 0 277.183465 384.209053 -1.164153e-10 278.394725 1.545010 -2.720528 102252.303312 0.000008 0.650958 91.884130 2020-01-01 00:00:00 58.1 1.577837e+09 1 3 0 training
1 1 276.695430 353.220263 -1.164153e-10 277.772899 1.282742 -2.308789 102211.168636 0.000002 0.603699 92.715877 2020-01-01 01:00:00 43.2 1.577840e+09 1 3 1 training
2 2 276.505662 255.911846 -1.164153e-10 277.463419 0.758730 -2.216471 102174.855967 0.000005 0.710378 93.485560 2020-01-01 02:00:00 43.0 1.577844e+09 1 3 2 testing
3 3 276.412816 191.375560 -1.164153e-10 277.305813 0.509013 -1.928623 102166.786485 0.000005 0.837765 93.906363 2020-01-01 03:00:00 42.8 1.577848e+09 1 3 3 training
4 4 276.553051 151.780210 -1.164153e-10 277.478941 0.607069 -1.700043 102142.578039 0.000003 0.819103 93.696878 2020-01-01 04:00:00 36.8 1.577851e+09 1 3 4 testing
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6368 6368 272.197565 476.945688 -5.820766e-11 273.557442 -1.945195 1.380939 99902.506413 0.000000 0.918149 90.582979 2020-12-31 19:00:00 11.7 1.609441e+09 366 4 19 training
6369 6369 272.171041 486.665851 -5.820766e-11 273.629146 -2.102732 0.987925 99947.625909 0.000000 0.839639 89.939908 2020-12-31 20:00:00 11.0 1.609445e+09 366 4 20 training
6370 6370 272.087408 489.355002 -5.820766e-11 273.470592 -1.933668 0.681543 100000.215520 0.000000 0.739354 90.422188 2020-12-31 21:00:00 15.3 1.609448e+09 366 4 21 testing
6371 6371 272.235319 40.714872 -5.820766e-11 272.926062 -0.583816 1.020793 100042.844978 0.000000 0.643753 95.088677 2020-12-31 22:00:00 17.1 1.609452e+09 366 4 22 training
6372 6372 272.020979 55.617254 -5.820766e-11 272.681367 -0.377511 0.959517 100053.601944 0.000000 0.549403 95.290673 2020-12-31 23:00:00 15.2 1.609456e+09 366 4 23 training

6373 rows × 18 columns

[49]:
# Full feature list, kept for reference; it is not passed to nm.pdp below.
all_features = [
    'u10', 'v10', 'd2m', 't2m', 'blh',
    'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    'date_unix', 'day_julian', 'weekday', 'hour',
]
# Partial dependence of model1 for a single variable, 'blh'.
pdp_value = nm.pdp(df1a, model1, var_list=['blh'])
[50]:
# Display the partial-dependence table (variable, value, pdp_mean, pdp_std).
pdp_value
[50]:
variable value pdp_mean pdp_std
0 blh 73.415911 15.742461 8.358786
1 blh 88.917320 15.359839 8.271418
2 blh 104.418730 14.966788 8.354652
3 blh 119.920140 15.582338 8.428540
4 blh 135.421549 13.699515 7.471944
... ... ... ... ...
95 blh 1546.049822 6.940856 4.955734
96 blh 1561.551231 6.955089 4.957737
97 blh 1577.052641 6.960697 4.952725
98 blh 1592.554051 7.132266 4.943666
99 blh 1608.055460 7.126766 4.943877

100 rows × 4 columns

[51]:
# Partial dependence of model1 for two variables, 'blh' and 't2m'.
pdp_value = nm.pdp(
    df1a,
    model1,
    var_list=['blh', 't2m'],
)
[52]:
# Display the partial-dependence table for 'blh' and 't2m'.
pdp_value
[52]:
variable value pdp_mean pdp_std
0 blh 73.415911 15.742461 8.358786
1 blh 88.917320 15.359839 8.271418
2 blh 104.418730 14.966788 8.354652
3 blh 119.920140 15.582338 8.428540
4 blh 135.421549 13.699515 7.471944
... ... ... ... ...
195 t2m 294.518468 10.544835 7.865555
196 t2m 294.715875 10.532480 7.856032
197 t2m 294.913281 10.565610 7.847180
198 t2m 295.110688 10.527926 7.840150
199 t2m 295.308095 10.515571 7.841594

200 rows × 4 columns

[53]:
# Full feature list, kept for reference; it is not passed to nm.pdp below.
all_features = [
    'u10', 'v10', 'd2m', 't2m', 'blh',
    'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    'date_unix', 'day_julian', 'weekday', 'hour',
]
# No var_list is given; the table displayed afterwards runs from 'weekday'
# to 'hour' — presumably every feature of model1 (confirm against nm.pdp).
pdp_value = nm.pdp(df1a, model1)
[54]:
# Display the partial-dependence table produced without a var_list.
pdp_value
[54]:
variable value pdp_mean pdp_std
0 weekday 1.0 8.808568 7.348096
1 weekday 2.0 8.975131 7.397715
2 weekday 3.0 9.519126 7.803357
3 weekday 4.0 9.178989 7.852847
4 weekday 5.0 9.756207 7.741670
... ... ... ... ...
1226 hour 19.0 9.648269 7.468077
1227 hour 20.0 9.583308 7.474737
1228 hour 21.0 9.275591 7.467640
1229 hour 22.0 9.089483 7.493595
1230 hour 23.0 8.733808 7.531601

1231 rows × 4 columns

[ ]: