[1]:

import pandas as pd
import numpy as np
import time
import normet as nm
import matplotlib.pyplot as plt
import matplotlib
from pylab import savefig

[2]:

# Read the input CSV, parsing the 'date' column as datetimes and using it as the index.
df1 = pd.read_csv(
    r'data/MY1_data.csv',
    parse_dates=['date'],
    index_col='date',
)

[3]:

# Display the loaded frame (the notebook renders the cell's last expression).
df1

[3]:

	O3	NO	NO2	NOXasNO2	SO2	CO	PM10	NV10	V10	PM2.5	...	d2m	t2m	blh	sp	ssrd	tcc	tp	rh2m	lat	lon
date
2020-01-01 00:00:00	1.72961	78.38595	45.77784	165.96796	4.75424	NaN	69.0	60.0	9.0	58.1	...	277.183465	278.394725	384.209053	102252.303312	-1.164153e-10	0.650958	0.000008	91.884130	51.52253	-0.154611
2020-01-01 01:00:00	1.92918	88.61587	52.64325	188.51903	4.84394	0.397528	45.0	38.4	6.6	43.2	...	276.695430	277.772899	353.220263	102211.168636	-1.164153e-10	0.603699	0.000002	92.715877	51.52253	-0.154611
2020-01-01 02:00:00	1.99570	70.02935	44.76870	152.14554	3.09474	0.346417	46.2	39.1	7.1	43.0	...	276.505662	277.463419	255.911846	102174.855967	-1.164153e-10	0.710378	0.000005	93.485560	51.52253	-0.154611
2020-01-01 03:00:00	2.04559	67.58589	40.20699	143.83725	2.96019	0.335059	45.1	38.8	6.3	42.8	...	276.412816	277.305813	191.375560	102166.786485	-1.164153e-10	0.837765	0.000005	93.906363	51.52253	-0.154611
2020-01-01 04:00:00	2.99355	72.03298	47.26010	157.70912	3.83478	0.349257	40.8	34.2	6.6	36.8	...	276.553051	277.478941	151.780210	102142.578039	-1.164153e-10	0.819103	0.000003	93.696878	51.52253	-0.154611
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2020-12-31 19:00:00	20.70539	12.46950	30.47461	49.59424	1.12164	0.129904	13.9	11.8	2.1	11.7	...	272.197565	273.557442	476.945688	99902.506413	-5.820766e-11	0.918149	0.000000	90.582979	51.52253	-0.154611
2020-12-31 20:00:00	24.14797	9.65279	26.51175	41.31249	1.12164	0.094180	14.6	11.3	3.3	11.0	...	272.171041	273.629146	486.665851	99947.625909	-5.820766e-11	0.839639	0.000000	89.939908	51.52253	-0.154611
2020-12-31 21:00:00	25.69464	12.46950	28.45232	47.57196	1.36199	0.087685	16.6	13.0	3.6	15.3	...	272.087408	273.470592	489.355002	100000.215520	-5.820766e-11	0.739354	0.000000	90.422188	51.52253	-0.154611
2020-12-31 22:00:00	26.39313	6.45629	25.05721	34.95672	0.88129	0.084437	19.1	16.0	3.1	17.1	...	272.235319	272.926062	40.714872	100042.844978	-5.820766e-11	0.643753	0.000000	95.088677	51.52253	-0.154611
2020-12-31 23:00:00	27.93980	6.61453	22.07004	32.21218	0.88129	0.087685	17.6	13.2	4.4	15.2	...	272.020979	272.681367	55.617254	100053.601944	-5.820766e-11	0.549403	0.000000	95.290673	51.52253	-0.154611

8784 rows × 65 columns

AutoML-based weather normalisation

[4]:

# Build the modelling frame for PM2.5 from the ten weather columns,
# using a random split with fraction=0.75 and a fixed seed.
df1a = nm.prepare_data(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    split_method='random',
    fraction=0.75,
    seed=7654321,
)

[5]:

# AutoML settings handed to nm.train_model through `model_config`.
model_config = dict(
    time_budget=90,           # total running time in seconds
    metric='r2',              # primary regression metric: 'mae', 'mse', 'r2', 'mape', ...
    estimator_list=["lgbm"],  # ML learners: "lgbm", "rf", "xgboost", "extra_tree", "xgb_limitdepth"
)

[6]:

# Train with the default AutoML package on the weather and time predictors.
# The trailing ';' suppresses the display of the cell's return value.
model1 = nm.train_model(
    df1a,
    variables=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    model_config=model_config,
);

2024-09-24 16:20:23 : Training AutoML...
2024-09-24 16:21:57 : Best model is lgbm with best model parameters of {'n_estimators': 4779, 'num_leaves': 37, 'min_child_samples': 13, 'learning_rate': 0.05353325544814332, 'log_max_bin': 10, 'colsample_bytree': 0.7006661480744041, 'reg_alpha': 3.4963871783049667, 'reg_lambda': 0.12148015741620988}

[7]:

# Display the trained model object returned by nm.train_model.
model1

[7]:

AutoML(append_log=False, auto_augment=True, custom_hp={},
       cv_score_agg_func=None, early_stop=False, ensemble=False,
       estimator_list='auto', eval_method='auto', fit_kwargs_by_estimator={},
       force_cancel=False, free_mem_ratio=0, hpo_method='auto',
       keep_search_state=False, learner_selector='sample', log_file_name='',
       log_training_metric=False, log_type='better', max_iter=None,
       mem_thres=4294967296, metric='auto', metric_constraints=[],
       min_sample_size=10000, mlflow_logging=True, model_history=False,
       n_concurrent_trials=1, n_jobs=-1, n_splits=5, pred_time_limit=inf,
       preserve_checkpoint=True, retrain_full=True, ...)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

[8]:

# Report the selected estimator and its hyper-parameters,
# then display the best trial's result record as the cell output.
best_model, best_config = model1.best_estimator, model1.best_config
print("Best model:", best_model)
print("Best model parameters:", best_config)
model1.best_result

Best model: lgbm
Best model parameters: {'n_estimators': 4779, 'num_leaves': 37, 'min_child_samples': 13, 'learning_rate': 0.05353325544814332, 'log_max_bin': 10, 'colsample_bytree': 0.7006661480744041, 'reg_alpha': 3.4963871783049667, 'reg_lambda': 0.12148015741620988}

[8]:

{'pred_time': 1.9110346438994467e-05,
 'wall_clock_time': 90.33755874633789,
 'metric_for_logging': {'pred_time': 1.9110346438994467e-05},
 'val_loss': 0.12917426813136582,
 'training_iteration': 1,
 'config': {'n_estimators': 4779,
  'num_leaves': 37,
  'min_child_samples': 13,
  'learning_rate': 0.05353325544814332,
  'log_max_bin': 10,
  'colsample_bytree': 0.7006661480744041,
  'reg_alpha': 3.4963871783049667,
  'reg_lambda': 0.12148015741620988},
 'config/n_estimators': 4779,
 'config/num_leaves': 37,
 'config/min_child_samples': 13,
 'config/learning_rate': 0.05353325544814332,
 'config/log_max_bin': 10,
 'config/colsample_bytree': 0.7006661480744041,
 'config/reg_alpha': 3.4963871783049667,
 'config/reg_lambda': 0.12148015741620988,
 'experiment_tag': 'exp',
 'time_total_s': 9.342859029769897}

[9]:

# Display the model's feature-importance array (one entry per input feature).
model1.feature_importances_

[9]:

array([ 993, 6549, 6934, 5345, 5295, 6633, 6315, 4237, 6126, 3377, 6176,
       4652, 1062, 3878], dtype=int32)

[10]:

# Display the feature names in the order the model received them.
model1.feature_names_in_

[10]:

['weekday',
 'u10',
 'v10',
 'd2m',
 't2m',
 'blh',
 'sp',
 'ssrd',
 'tcc',
 'tp',
 'rh2m',
 'date_unix',
 'day_julian',
 'hour']

[11]:

# Settings for the H2O run; this rebinds `model_config`.
model_config = dict(
    time_budget=90,
    # max_models=10,         # Maximum number of models to train
    # max_mem_size='12g',    # Maximum memory size for H2O
    estimator_list=['GBM'],  # List of algorithms to use in AutoML
)

[12]:

# Train a second model on the same predictors, this time with automl_pkg='h2o'.
# The trailing ';' suppresses the display of the cell's return value.
model2 = nm.train_model(
    df1a,
    automl_pkg='h2o',
    variables=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    model_config=model_config,
);

H2O is not running. Starting H2O...
Checking whether there is an H2O instance running at http://localhost:54321. connected.


H2O_cluster_uptime:	3 hours 59 mins
H2O_cluster_timezone:	Europe/London
H2O_data_parsing_timezone:	UTC
H2O_cluster_version:	3.46.0.5
H2O_cluster_version_age:	26 days
H2O_cluster_name:	H2O_from_python_n94921cs_5qrqdn
H2O_cluster_total_nodes:	1
H2O_cluster_free_memory:	7.572 Gb
H2O_cluster_total_cores:	8
H2O_cluster_allowed_cores:	1
H2O_cluster_status:	locked, healthy
H2O_connection_url:	http://localhost:54321
H2O_connection_proxy:	{"http": null, "https": null}
H2O_internal_security:	False
Python_version:	3.12.2 final

2024-09-24 16:21:58: Training AutoML...
2024-09-24 16:22:42: Best model obtained! - GBM_grid_1_AutoML_4_20240924_162158_model_2

[13]:

# Display the H2O model summary.
model2

[13]:

Model Details
=============
H2OGradientBoostingEstimator : Gradient Boosting Machine
Model Key: GBM_grid_1_AutoML_4_20240924_162158_model_2

Model Summary:
	number_of_trees	number_of_internal_trees	model_size_in_bytes	min_depth	max_depth	mean_depth	min_leaves	max_leaves	mean_leaves
	69.0	69.0	1189276.0	13.0	13.0	13.0	625.0	1942.0	1369.2754

ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 0.011242541344013361
RMSE: 0.10603085090676846
MAE: 0.07893400538333294
RMSLE: NaN
Mean Residual Deviance: 0.011242541344013361

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 8.126360199772725
RMSE: 2.850677147586644
MAE: 1.9222905950632483
RMSLE: NaN
Mean Residual Deviance: 8.126360199772725

Cross-Validation Metrics Summary:
	mean	sd	cv_1_valid	cv_2_valid	cv_3_valid	cv_4_valid	cv_5_valid
aic	nan	0.0	nan	nan	nan	nan	nan
loglikelihood	nan	0.0	nan	nan	nan	nan	nan
mae	1.9222906	0.0324555	1.9084675	1.9052794	1.888859	1.9710108	1.9378362
mean_residual_deviance	8.12636	0.6548642	8.182448	7.1749625	7.8459816	8.850865	8.577542
mse	8.12636	0.6548642	8.182448	7.1749625	7.8459816	8.850865	8.577542
r2	0.8793102	0.0095628	0.8805432	0.8902119	0.8867002	0.8692322	0.8698634
residual_deviance	8.12636	0.6548642	8.182448	7.1749625	7.8459816	8.850865	8.577542
rmse	2.8487926	0.1158737	2.860498	2.678612	2.801068	2.9750404	2.928744
rmsle	nan	0.0	nan	nan	nan	nan	nan

Scoring History:
timestamp	duration	number_of_trees	training_rmse	training_mae	training_deviance
2024-09-24 16:22:28	7.661 sec	0.0	8.2062346	5.4746640	67.3422868
2024-09-24 16:22:28	7.723 sec	5.0	5.0442433	3.4151840	25.4443903
2024-09-24 16:22:28	7.779 sec	10.0	3.1914116	2.2053144	10.1851080
2024-09-24 16:22:28	7.835 sec	15.0	2.0162629	1.4358716	4.0653160
2024-09-24 16:22:28	7.892 sec	20.0	1.3258726	0.9765391	1.7579382
2024-09-24 16:22:28	7.957 sec	25.0	0.8801164	0.6675258	0.7746049
2024-09-24 16:22:28	8.014 sec	30.0	0.6213492	0.4796348	0.3860749
2024-09-24 16:22:28	8.067 sec	35.0	0.4578002	0.3572500	0.2095810
2024-09-24 16:22:28	8.132 sec	40.0	0.3379802	0.2647451	0.1142306
2024-09-24 16:22:28	8.191 sec	45.0	0.2648459	0.2051404	0.0701433
2024-09-24 16:22:28	8.251 sec	50.0	0.2069501	0.1595394	0.0428284
2024-09-24 16:22:28	8.305 sec	55.0	0.1663724	0.1275311	0.0276798
2024-09-24 16:22:28	8.354 sec	60.0	0.1397592	0.1064105	0.0195326
2024-09-24 16:22:28	8.408 sec	65.0	0.1167263	0.0878304	0.0136250
2024-09-24 16:22:29	8.495 sec	69.0	0.1060309	0.0789340	0.0112425

Variable Importances:
variable	relative_importance	scaled_importance	percentage
u10	251463.8437500	1.0	0.1484518
blh	210509.4843750	0.8371362	0.1242744
d2m	202005.6875000	0.8033190	0.1192542
day_julian	201374.0937500	0.8008073	0.1188813
sp	158984.7187500	0.6322369	0.0938567
date_unix	130323.0937500	0.5182578	0.0769363
t2m	108348.5078125	0.4308711	0.0639636
v10	87987.5078125	0.3499012	0.0519435
weekday	81752.5156250	0.3251064	0.0482626
rh2m	80490.9296875	0.3200895	0.0475179
hour	61422.0507812	0.2442580	0.0362605
tcc	57315.1562500	0.2279260	0.0338360
ssrd	46235.0937500	0.1838638	0.0272949
tp	15695.9404297	0.0624183	0.0092661


[tips]
Use `model.explain()` to inspect the model.
--
Use `h2o.display.toggle_user_tips()` to switch on/off this section.

[14]:

# IPython help syntax: show the signature and docstring of nm.train_model.
?nm.train_model

Signature:
nm.train_model(
    df,
    value='value',
    automl_pkg='flaml',
    variables=None,
    model_config=None,
    seed=7654321,
    n_cores=None,
    verbose=True,
)
Docstring:
Trains a machine learning model using either FLAML or H2O AutoML.

Parameters:
    df (pandas.DataFrame): Input dataset to train the model.
    value (str): The name of the target column in the dataset. Default is "value".
    automl_pkg (str): The AutoML package to use ("flaml" or "h2o").
    variables (list, optional): List of feature variables to use for training.
    model_config (dict, optional): Configuration settings for the model training.
    seed (int, optional): Random seed for reproducibility. Default is 7654321.
    n_cores (int, optional): Number of CPU cores to use for training. Default is None.
    verbose (bool, optional): Whether to print detailed logs. Default is True.

Returns:
    model: Trained machine learning model with a custom attribute `_model_type` indicating the package used.
File:      ~/anaconda3/envs/normet/lib/python3.12/site-packages/normet/normet.py
Type:      function

[15]:

# Compute evaluation statistics for model1 on the prepared frame.
mod_stats1 = nm.modStats(
    df1a,
    model1,
)

[16]:

# Display the statistics table for model1.
mod_stats1

[16]:

	n	FAC2	MB	MGE	NMB	NMGE	RMSE	r	p_level	COE	IOA	R2	set
0	4780	0.987238	-0.000007	0.268328	-7.437997e-07	0.029447	0.351283	0.999110	***	0.950987	0.975494	0.998222	training
1	1593	0.907721	0.090223	1.920277	9.806910e-03	0.208727	2.699133	0.940890	***	0.650259	0.825129	0.885274	testing
2	6373	0.967362	0.022547	0.681250	2.468418e-03	0.074582	1.383329	0.985535	***	0.875668	0.937834	0.971279	all

[17]:

# Compute evaluation statistics for model2 on the prepared frame.
mod_stats2 = nm.modStats(
    df1a,
    model2,
)

[18]:

# Display the statistics table for model2.
mod_stats2

[18]:

	n	FAC2	MB	MGE	NMB	NMGE	RMSE	r	p_level	COE	IOA	R2	set
0	4780	0.994142	2.145110e-08	0.078934	2.354070e-09	0.008662	0.106031	0.999923	***	0.985582	0.992791	0.999846	training
1	1593	0.901444	1.458128e-01	1.952102	1.584932e-02	0.212186	2.753047	0.939557	***	0.644462	0.822231	0.882767	testing
2	6373	0.970971	3.644749e-02	0.547153	3.990205e-03	0.059901	1.379475	0.985645	***	0.900142	0.950071	0.971496	all

[19]:

# Time an aggregated nm.normalise run of model1 with n_samples=1000.
# time.perf_counter() is used for the interval because it is a monotonic,
# high-resolution clock; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
df_dew1 = nm.normalise(
    df1a,
    model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=1000,
    aggregate=True,
)
end_time = time.perf_counter()

# Compute the elapsed time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

2024-09-24 16:22:45: Normalising the dataset in parallel.
2024-09-24 16:22:47: Predicting using trained model in batches.
2024-09-24 16:25:07: Aggregating 1000 predictions...
Execution time: 142.41 seconds

[20]:

# Time an aggregated nm.normalise run of model2 (the H2O model) with n_samples=1000.
# time.perf_counter() is used for the interval because it is a monotonic,
# high-resolution clock; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
df_dew2 = nm.normalise(
    df1a,
    model2,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=1000,
    aggregate=True,
)
end_time = time.perf_counter()

# Compute the elapsed time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

2024-09-24 16:25:08: Normalising the dataset in parallel.
2024-09-24 16:25:09: Predicting using trained model in batches.
2024-09-24 16:29:04: Aggregating 1000 predictions...
Execution time: 236.45 seconds

[21]:

# Compare the normalised series from the two models on one explicit Axes.
# Passing ax=ax ties both lines to the figure created here instead of relying
# on pyplot's implicit "current axes"; labels + legend identify each line.
fig, ax = plt.subplots()
df_dew1['normalised'].plot(ax=ax, label='model1 (FLAML)', legend=True)
df_dew2['normalised'].plot(ax=ax, label='model2 (H2O)', legend=True)

[21]:

<Axes: xlabel='date'>

[22]:

# Time a nm.normalise run of model1 with n_samples=1000 and aggregate=False,
# i.e. without the aggregation step used for df_dew1.
# time.perf_counter() is used for the interval because it is a monotonic,
# high-resolution clock; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
df_dew3 = nm.normalise(
    df1a,
    model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=1000,
    aggregate=False,
)
end_time = time.perf_counter()

# Compute the elapsed time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

2024-09-24 16:29:05: Normalising the dataset in parallel.
2024-09-24 16:29:06: Predicting using trained model in batches.
Execution time: 143.14 seconds

[23]:

# Preview the first rows of the non-aggregated result.
df_dew3.head()

[23]:

	date	observed	normalised	seed
0	2020-01-01 00:00:00	58.1	17.221536	979812
1	2020-01-01 01:00:00	43.2	32.937436	979812
2	2020-01-01 02:00:00	43.0	24.423807	979812
3	2020-01-01 03:00:00	42.8	23.372347	979812
4	2020-01-01 04:00:00	36.8	26.478917	979812

[24]:

# Take the first 100 rows of df1 (index reset to 0..n-1) and keep only the
# ten weather columns; this frame is later passed as `weather_df`.
weather_df = df1.reset_index().head(100)[[
    'u10', 'v10', 'd2m', 't2m', 'blh',
    'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
]]

[25]:

# Preview the first rows of the weather subset.
weather_df.head()

[25]:

	u10	v10	d2m	t2m	blh	sp	ssrd	tcc	tp	rh2m
0	-2.720528	1.545010	277.183465	278.394725	384.209053	102252.303312	-1.164153e-10	0.650958	0.000008	91.884130
1	-2.308789	1.282742	276.695430	277.772899	353.220263	102211.168636	-1.164153e-10	0.603699	0.000002	92.715877
2	-2.216471	0.758730	276.505662	277.463419	255.911846	102174.855967	-1.164153e-10	0.710378	0.000005	93.485560
3	-1.928623	0.509013	276.412816	277.305813	191.375560	102166.786485	-1.164153e-10	0.837765	0.000005	93.906363
4	-1.700043	0.607069	276.553051	277.478941	151.780210	102142.578039	-1.164153e-10	0.819103	0.000003	93.696878

[26]:

# Time an aggregated nm.normalise run of model1 with an explicit weather_df
# and n_samples=300.
# NOTE: this rebinds `df_dew2`, replacing the model2 result computed earlier;
# cells below that read `df_dew2` see this result instead.
# time.perf_counter() is used for the interval because it is a monotonic,
# high-resolution clock; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
df_dew2 = nm.normalise(
    df1a,
    model1,
    weather_df=weather_df,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=300,
    aggregate=True,
)
end_time = time.perf_counter()

# Compute the elapsed time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

2024-09-24 16:31:28: Normalising the dataset in parallel.
2024-09-24 16:31:28: Predicting using trained model in batches.
2024-09-24 16:32:10: Aggregating 300 predictions...
Execution time: 42.37 seconds

[27]:

# Overlay the two model1 normalisations on one explicit Axes: the run without
# weather_df (red) and the run with weather_df (blue). Labels + legend
# identify the lines; ax=ax avoids relying on pyplot's implicit current axes.
fig, ax = plt.subplots()
df_dew1['normalised'].plot(ax=ax, c='r', label='model1 (n_samples=1000)', legend=True)
df_dew2['normalised'].plot(ax=ax, c='b', label='model1 + weather_df (n_samples=300)', legend=True)

[27]:

<Axes: xlabel='date'>

[28]:

# Settings for the nm.do_all runs below; this rebinds `model_config`.
model_config = dict(
    time_budget=60,  # total running time in seconds
    metric='r2',
)

[29]:

# List the columns available in the prepared frame.
df1a.columns

[29]:

Index(['rowid', 'd2m', 'blh', 'ssrd', 't2m', 'v10', 'u10', 'sp', 'tp', 'tcc',
       'rh2m', 'date', 'value', 'date_unix', 'day_julian', 'weekday', 'hour',
       'set'],
      dtype='object')

[30]:

# One-call pipeline on the raw frame: no model is passed, so nm.do_all
# trains one with `model_config` before normalising with n_samples=100.
df_dew, mod_stats = nm.do_all(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    model_config=model_config,
    n_samples=100,
)

2024-09-24 16:32:10 : Training AutoML...
2024-09-24 16:33:08 : Best model is lgbm with best model parameters of {'n_estimators': 527, 'num_leaves': 57, 'min_child_samples': 19, 'learning_rate': 0.10029209493914669, 'log_max_bin': 10, 'colsample_bytree': 0.777659907533841, 'reg_alpha': 5.054157418960246, 'reg_lambda': 0.023089272254781048}
2024-09-24 16:33:09: Normalising the dataset in parallel.
2024-09-24 16:33:09: Predicting using trained model in batches.
2024-09-24 16:33:14: Aggregating 100 predictions...

[31]:

# Same pipeline, but on the prepared frame with the already-trained model1
# passed as the second positional argument.
df_dew, mod_stats = nm.do_all(
    df1a,
    model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    model_config=model_config,
    n_samples=100,
)

2024-09-24 16:33:15: Normalising the dataset in parallel.
2024-09-24 16:33:15: Predicting using trained model in batches.
2024-09-24 16:33:28: Aggregating 100 predictions...

[32]:

# Resampling from a user-supplied dataset (weather_df) with the trained model1.
df_dew, mod_stats = nm.do_all(
    df1a,
    model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    weather_df=weather_df,
    model_config=model_config,
    n_samples=100,
)

2024-09-24 16:33:29: Normalising the dataset in parallel.
2024-09-24 16:33:29: Predicting using trained model in batches.
2024-09-24 16:33:42: Aggregating 100 predictions...

[33]:

# Run nm.do_all_unc on the raw frame with n_models=5 and n_samples=100;
# the result holds one normalised column per model plus summary columns.
df_dew, mod_stats = nm.do_all_unc(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=100,
    n_models=5,
)

2024-09-24 16:35:15 : Progress: 20.00% (Model 1/5)... ETA: 6.22 minutes
2024-09-24 16:37:17 : Progress: 40.00% (Model 2/5)... ETA: 5.37 minutes
2024-09-24 16:39:04 : Progress: 60.00% (Model 3/5)... ETA: 3.58 minutes
2024-09-24 16:41:10 : Progress: 80.00% (Model 4/5)... ETA: 1.87 minutes
2024-09-24 16:42:49 : Progress: 100.00% (Model 5/5)... ETA: 0.00 seconds

[34]:

# Preview the first rows of the multi-model result.
df_dew.head()

[34]:

	observed	normalised_979812	normalised_378829	normalised_120727	normalised_541475	normalised_488292	mean	std	median	lower_bound	upper_bound	weighted
date
2020-01-01 00:00:00	58.1	30.252205	26.007282	21.472511	13.609010	13.521217	20.972445	6.655142	21.472511	13.529996	29.827713	20.220786
2020-01-01 01:00:00	43.2	30.718119	22.379913	22.043467	11.979618	14.756184	20.375460	6.570192	22.043467	12.257275	29.884298	18.638353
2020-01-01 02:00:00	43.0	30.120009	21.627895	20.220487	13.131916	14.394936	19.899048	6.060521	20.220487	13.258218	29.270797	18.202305
2020-01-01 03:00:00	42.8	28.591281	21.836566	22.887696	11.813100	12.544112	19.534551	6.435126	21.836566	11.886201	28.020922	18.432641
2020-01-01 04:00:00	36.8	30.390922	21.896232	21.653471	12.113235	12.944652	19.799702	6.724275	21.653471	12.196376	29.541453	18.257228

Time series decomposition

[35]:

# Run nm.decom_emi on the raw frame; no model is passed, so one is trained
# first using a random split with fraction=0.75.
df_dewca, mod_stats = nm.decom_emi(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    split_method='random',
    fraction=0.75,
    n_samples=300,
)

2024-09-24 16:42:49 : Training AutoML...
2024-09-24 16:44:15 : Best model is lgbm with best model parameters of {'n_estimators': 4779, 'num_leaves': 37, 'min_child_samples': 13, 'learning_rate': 0.05353325544814332, 'log_max_bin': 10, 'colsample_bytree': 0.7006661480744041, 'reg_alpha': 3.4963871783049667, 'reg_lambda': 0.12148015741620988}
2024-09-24 16:44:15 : Subtracting base...
2024-09-24 16:44:59 : Subtracting date_unix... ETA: 2.93 minutes
2024-09-24 16:45:41 : Subtracting day_julian... ETA: 2.16 minutes
2024-09-24 16:46:27 : Subtracting weekday... ETA: 1.46 minutes
2024-09-24 16:47:12 : Subtracting hour... ETA: 44.25 seconds

[36]:

# Display the nm.decom_emi result table.
df_dewca

[36]:

	observed	base	date_unix	day_julian	weekday	hour	deweathered	emi_noise
date
2020-01-01 00:00:00	58.1	9.834232	18.762883	3.617389	0.991752	0.271323	24.320077	0.676729
2020-01-01 01:00:00	43.2	9.211363	19.549169	3.381027	0.795226	0.420953	24.200235	0.053860
2020-01-01 02:00:00	43.0	8.740657	18.760258	4.192777	0.800782	-0.187559	23.149413	-0.416846
2020-01-01 03:00:00	42.8	8.869214	18.804496	3.959970	0.529601	-0.208110	22.797669	-0.288289
2020-01-01 04:00:00	36.8	8.184215	19.394484	3.038679	0.761000	0.107604	22.328479	-0.973288
...	...	...	...	...	...	...	...	...
2020-12-31 19:00:00	11.7	8.621910	12.759990	0.336607	-0.046839	0.406496	12.920661	-0.535593
2020-12-31 20:00:00	11.0	8.749583	12.430059	0.528524	-0.321991	0.086078	12.314750	-0.407920
2020-12-31 21:00:00	15.3	8.957951	12.361814	0.078852	-0.219580	0.304223	12.325758	-0.199552
2020-12-31 22:00:00	17.1	11.150086	10.425434	0.642186	-0.419770	0.013659	12.654092	1.992583
2020-12-31 23:00:00	15.2	8.579548	12.403958	0.512501	0.115038	-0.705544	11.747998	-0.577955

6373 rows × 8 columns

[37]:

# Run nm.decom_emi on the prepared frame, reusing the trained model1.
df_dewca, mod_stats = nm.decom_emi(
    df1a,
    model=model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=300,
)

2024-09-24 16:47:57 : Subtracting base...
2024-09-24 16:48:43 : Subtracting date_unix... ETA: 3.06 minutes
2024-09-24 16:49:30 : Subtracting day_julian... ETA: 2.31 minutes
2024-09-24 16:50:13 : Subtracting weekday... ETA: 1.51 minutes
2024-09-24 16:50:54 : Subtracting hour... ETA: 44.25 seconds

[38]:

# Run nm.decom_met on the raw frame; no model is passed, so one is trained first.
df_dewcb, mod_stats = nm.decom_met(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=300,
    fraction=0.75,
    seed=7654321,
)

2024-09-24 16:51:38 : Training AutoML...
2024-09-24 16:53:13 : Best model is lgbm with best model parameters of {'n_estimators': 4779, 'num_leaves': 37, 'min_child_samples': 13, 'learning_rate': 0.05353325544814332, 'log_max_bin': 10, 'colsample_bytree': 0.7006661480744041, 'reg_alpha': 3.4963871783049667, 'reg_lambda': 0.12148015741620988}
2024-09-24 16:53:14 : Subtracting deweathered...
2024-09-24 16:53:59 : Subtracting v10... ETA: 7.63 minutes
2024-09-24 16:54:42 : Subtracting blh... ETA: 6.60 minutes
2024-09-24 16:55:24 : Subtracting u10... ETA: 5.79 minutes
2024-09-24 16:56:05 : Subtracting sp... ETA: 4.98 minutes
2024-09-24 16:56:45 : Subtracting rh2m... ETA: 4.22 minutes
2024-09-24 16:57:25 : Subtracting tcc... ETA: 3.49 minutes
2024-09-24 16:58:05 : Subtracting d2m... ETA: 2.78 minutes
2024-09-24 16:58:44 : Subtracting t2m... ETA: 2.07 minutes
2024-09-24 16:59:24 : Subtracting ssrd... ETA: 1.37 minutes
2024-09-24 17:00:06 : Subtracting tp... ETA: 41.25 seconds

[39]:

# Run nm.decom_met on the prepared frame, reusing the trained model1.
df_dewcb, mod_stats = nm.decom_met(
    df1a,
    model=model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=300,
    fraction=0.75,
    seed=7654321,
)

2024-09-24 17:00:49 : Subtracting deweathered...
2024-09-24 17:01:32 : Subtracting v10... ETA: 7.12 minutes
2024-09-24 17:02:15 : Subtracting blh... ETA: 6.50 minutes
2024-09-24 17:03:01 : Subtracting u10... ETA: 5.86 minutes
2024-09-24 17:03:45 : Subtracting sp... ETA: 5.13 minutes
2024-09-24 17:04:29 : Subtracting rh2m... ETA: 4.40 minutes
2024-09-24 17:05:11 : Subtracting tcc... ETA: 3.64 minutes
2024-09-24 17:05:53 : Subtracting d2m... ETA: 2.89 minutes
2024-09-24 17:06:38 : Subtracting t2m... ETA: 2.18 minutes
2024-09-24 17:07:17 : Subtracting ssrd... ETA: 1.44 minutes
2024-09-24 17:07:57 : Subtracting tp... ETA: 42.86 seconds

[40]:

# Display the nm.decom_met result table.
df_dewcb

[40]:

	observed	deweathered	v10	blh	u10	sp	rh2m	tcc	d2m	t2m	ssrd	tp	met_noise
date
2020-01-01 00:00:00	58.1	23.775707	2.803204	2.690817	10.533843	14.459222	6.452000	5.053862	4.216987	5.003190	4.792167	5.521003	1.596199
2020-01-01 01:00:00	43.2	23.712904	1.546627	4.394804	11.799731	10.566854	1.769647	1.297532	1.966014	2.295684	1.542771	1.862642	-0.930420
2020-01-01 02:00:00	43.0	23.308907	1.740158	6.840758	12.729884	8.339736	1.449420	1.877752	2.321789	0.919309	-0.113854	2.106147	-0.392609
2020-01-01 03:00:00	42.8	21.808085	2.402217	10.777716	12.743709	5.307592	1.729576	2.229663	2.012881	0.785298	0.305861	1.625649	0.265997
2020-01-01 04:00:00	36.8	22.173301	1.698096	8.612109	9.510499	3.318336	1.694841	2.583169	2.313238	0.499432	-0.465211	1.463167	-1.849514
...	...	...	...	...	...	...	...	...	...	...	...	...	...
2020-12-31 19:00:00	11.7	12.869070	0.146833	-0.498743	-0.443972	0.866222	0.683517	-0.338941	-0.219422	-0.602156	-1.466656	-0.560928	-0.034524
2020-12-31 20:00:00	11.0	12.018398	0.320049	-0.056764	-0.113305	0.966535	0.720601	-0.261063	-0.324155	-1.152303	-1.581597	-0.370613	-0.144190
2020-12-31 21:00:00	15.3	12.231738	0.397497	-0.242102	-0.911311	0.589118	0.786209	-0.246796	-0.040521	-0.713239	-1.326826	-0.321144	4.002425
2020-12-31 22:00:00	17.1	12.514066	0.676143	4.688038	1.570879	-1.779531	1.186440	0.857997	1.715646	0.982372	-0.903092	-0.228862	0.065919
2020-12-31 23:00:00	15.2	11.770696	0.539106	5.628566	2.067334	-2.426622	0.414002	0.017931	1.326883	0.882671	-0.988648	-0.456333	-0.216909

6373 rows × 13 columns

[41]:

# Run nm.decom_emi on the prepared frame with the trained model1
# (rebinds df_dewca and mod_stats).
df_dewca, mod_stats = nm.decom_emi(
    df1a,
    model=model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    n_samples=300,
)

2024-09-24 17:08:35 : Subtracting base...
2024-09-24 17:09:22 : Subtracting date_unix... ETA: 3.10 minutes
2024-09-24 17:10:09 : Subtracting day_julian... ETA: 2.36 minutes
2024-09-24 17:10:55 : Subtracting weekday... ETA: 1.56 minutes
2024-09-24 17:11:40 : Subtracting hour... ETA: 46.26 seconds

[42]:

# Display the nm.decom_emi result table from the run above.
df_dewca

[42]:

	observed	base	date_unix	day_julian	weekday	hour	deweathered	emi_noise
date
2020-01-01 00:00:00	58.1	9.636890	18.471507	3.981692	1.373361	-0.159863	24.146201	0.479505
2020-01-01 01:00:00	43.2	9.560233	18.945920	3.962606	0.950994	-0.031199	24.231168	0.402847
2020-01-01 02:00:00	43.0	9.457613	18.446638	3.478760	1.149818	-0.735301	22.640142	0.300227
2020-01-01 03:00:00	42.8	9.553435	17.988441	4.450814	0.201102	-0.431935	22.604472	0.396050
2020-01-01 04:00:00	36.8	8.259178	18.887683	3.579915	1.171948	-0.676739	22.064599	-0.898208
...	...	...	...	...	...	...	...	...
2020-12-31 19:00:00	11.7	8.791405	12.619534	0.570769	-0.282182	0.277319	12.819460	-0.365980
2020-12-31 20:00:00	11.0	8.768305	12.125637	0.483223	0.000221	-0.067327	12.152672	-0.389080
2020-12-31 21:00:00	15.3	8.492577	12.543496	0.251146	0.147919	-0.081575	12.196178	-0.664809
2020-12-31 22:00:00	17.1	9.704561	12.125288	0.186412	-0.348574	0.029269	12.539570	0.547175
2020-12-31 23:00:00	15.2	8.273833	12.567619	0.757832	-0.068558	-0.986782	11.386559	-0.883552

6373 rows × 8 columns

Rolling weather normalisation

[43]:

# Run nm.rolling with model1 using window_days=14 and rolling_every=7,
# with n_samples=100.
df_dewc1, mod_stats = nm.rolling(
    df1a,
    model1,
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
        'date_unix', 'day_julian', 'weekday', 'hour',
    ],
    variables_resample=[
        'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    n_samples=100,
    window_days=14,
    rolling_every=7,
)

2024-09-24 17:12:28: Rolling window 0 from 2020-01-01 to 2020-01-14
2024-09-24 17:12:37: Rolling window 10 from 2020-03-03 to 2020-03-16
2024-09-24 17:12:47: Rolling window 20 from 2020-05-02 to 2020-05-15
2024-09-24 17:12:55: Rolling window 30 from 2020-07-01 to 2020-07-14
2024-09-24 17:13:02: Rolling window 40 from 2020-09-17 to 2020-09-30

[44]:

# Preview the rolling result: 'observed' followed by one column per window.
df_dewc1.head()

[44]:

	observed	rolling_0	rolling_1	rolling_2	rolling_3	rolling_4	rolling_5	rolling_6	rolling_7	rolling_8	...	rolling_35	rolling_36	rolling_37	rolling_38	rolling_39	rolling_40	rolling_41	rolling_42	rolling_43	rolling_44
date
2020-01-01 00:00:00	58.1	20.207316	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-01 01:00:00	43.2	20.131948	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-01 02:00:00	43.0	19.247751	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-01 03:00:00	42.8	19.310005	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-01 04:00:00	36.8	20.072632	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 46 columns

[45]:

# Plot every column except the first one ('observed'), i.e. the rolling_* series.
df_dewc1.iloc[:,1:].plot()

[45]:

<Axes: xlabel='date'>

Partial Dependence Plots

[46]:

# Rebuild the prepared frame with the same arguments as the earlier prepare_data call.
df1a = nm.prepare_data(
    df1,
    value='PM2.5',
    feature_names=[
        'u10', 'v10', 'd2m', 't2m', 'blh',
        'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    ],
    split_method='random',
    fraction=0.75,
    seed=7654321,
)

[47]:

# Display the prepared frame.
df1a

[47]:

	rowid	d2m	blh	ssrd	t2m	v10	u10	sp	tp	tcc	rh2m	date	value	date_unix	day_julian	weekday	hour	set
0	0	277.183465	384.209053	-1.164153e-10	278.394725	1.545010	-2.720528	102252.303312	0.000008	0.650958	91.884130	2020-01-01 00:00:00	58.1	1.577837e+09	1	3	0	training
1	1	276.695430	353.220263	-1.164153e-10	277.772899	1.282742	-2.308789	102211.168636	0.000002	0.603699	92.715877	2020-01-01 01:00:00	43.2	1.577840e+09	1	3	1	training
2	2	276.505662	255.911846	-1.164153e-10	277.463419	0.758730	-2.216471	102174.855967	0.000005	0.710378	93.485560	2020-01-01 02:00:00	43.0	1.577844e+09	1	3	2	testing
3	3	276.412816	191.375560	-1.164153e-10	277.305813	0.509013	-1.928623	102166.786485	0.000005	0.837765	93.906363	2020-01-01 03:00:00	42.8	1.577848e+09	1	3	3	training
4	4	276.553051	151.780210	-1.164153e-10	277.478941	0.607069	-1.700043	102142.578039	0.000003	0.819103	93.696878	2020-01-01 04:00:00	36.8	1.577851e+09	1	3	4	testing
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
6368	6368	272.197565	476.945688	-5.820766e-11	273.557442	-1.945195	1.380939	99902.506413	0.000000	0.918149	90.582979	2020-12-31 19:00:00	11.7	1.609441e+09	366	4	19	training
6369	6369	272.171041	486.665851	-5.820766e-11	273.629146	-2.102732	0.987925	99947.625909	0.000000	0.839639	89.939908	2020-12-31 20:00:00	11.0	1.609445e+09	366	4	20	training
6370	6370	272.087408	489.355002	-5.820766e-11	273.470592	-1.933668	0.681543	100000.215520	0.000000	0.739354	90.422188	2020-12-31 21:00:00	15.3	1.609448e+09	366	4	21	testing
6371	6371	272.235319	40.714872	-5.820766e-11	272.926062	-0.583816	1.020793	100042.844978	0.000000	0.643753	95.088677	2020-12-31 22:00:00	17.1	1.609452e+09	366	4	22	training
6372	6372	272.020979	55.617254	-5.820766e-11	272.681367	-0.377511	0.959517	100053.601944	0.000000	0.549403	95.290673	2020-12-31 23:00:00	15.2	1.609456e+09	366	4	23	training

6373 rows × 18 columns

[49]:

# NOTE: all_features is defined here but not passed to nm.pdp below.
all_features = [
    'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    'date_unix', 'day_julian', 'weekday', 'hour',
]
# Partial dependence of model1 for a single variable, 'blh'.
pdp_value = nm.pdp(
    df1a,
    model1,
    var_list=['blh'],
)

[50]:

# Display the partial-dependence table for 'blh'.
pdp_value

[50]:

	variable	value	pdp_mean	pdp_std
0	blh	73.415911	15.742461	8.358786
1	blh	88.917320	15.359839	8.271418
2	blh	104.418730	14.966788	8.354652
3	blh	119.920140	15.582338	8.428540
4	blh	135.421549	13.699515	7.471944
...	...	...	...	...
95	blh	1546.049822	6.940856	4.955734
96	blh	1561.551231	6.955089	4.957737
97	blh	1577.052641	6.960697	4.952725
98	blh	1592.554051	7.132266	4.943666
99	blh	1608.055460	7.126766	4.943877

100 rows × 4 columns

[51]:

# Partial dependence of model1 for two variables, 'blh' and 't2m'.
pdp_value = nm.pdp(
    df1a,
    model1,
    var_list=['blh', 't2m'],
)

[52]:

# Display the partial-dependence table for 'blh' and 't2m'.
pdp_value

[52]:

	variable	value	pdp_mean	pdp_std
0	blh	73.415911	15.742461	8.358786
1	blh	88.917320	15.359839	8.271418
2	blh	104.418730	14.966788	8.354652
3	blh	119.920140	15.582338	8.428540
4	blh	135.421549	13.699515	7.471944
...	...	...	...	...
195	t2m	294.518468	10.544835	7.865555
196	t2m	294.715875	10.532480	7.856032
197	t2m	294.913281	10.565610	7.847180
198	t2m	295.110688	10.527926	7.840150
199	t2m	295.308095	10.515571	7.841594

200 rows × 4 columns

[53]:

# NOTE: all_features is defined here but not passed to nm.pdp below.
all_features = [
    'u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m',
    'date_unix', 'day_julian', 'weekday', 'hour',
]
# Partial dependence of model1 without a var_list argument.
pdp_value = nm.pdp(
    df1a,
    model1,
)

[54]:

# Display the partial-dependence table from the call without var_list.
pdp_value

[54]:

	variable	value	pdp_mean	pdp_std
0	weekday	1.0	8.808568	7.348096
1	weekday	2.0	8.975131	7.397715
2	weekday	3.0	9.519126	7.803357
3	weekday	4.0	9.178989	7.852847
4	weekday	5.0	9.756207	7.741670
...	...	...	...	...
1226	hour	19.0	9.648269	7.468077
1227	hour	20.0	9.583308	7.474737
1228	hour	21.0	9.275591	7.467640
1229	hour	22.0	9.089483	7.493595
1230	hour	23.0	8.733808	7.531601

1231 rows × 4 columns

[ ]: