
Speed Prediction

In this project, we use weather data to predict average traffic speed.

  • We use weather data from openweathermap.org, which provides temperature, wind, humidity, and weather conditions

  • The notebook runs on Google Colab

Some of the data preprocessing techniques are based on the official TensorFlow time-series tutorial:

https://www.tensorflow.org/tutorials/structured_data/time_series

Install and import dependencies

!pip install pactools
Collecting pactools
  Downloading https://files.pythonhosted.org/packages/17/14/4c4eba6e54408e536be27b9891cea68ea391d7d190936593aa71e5c6405e/pactools-0.3.1-py3-none-any.whl (82kB)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from pactools) (3.2.2)
Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from pactools) (2.10.0)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from pactools) (0.22.2.post1)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from pactools) (1.18.5)
Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from pactools) (1.4.1)
Collecting mne
  Downloading https://files.pythonhosted.org/packages/4d/0e/6448521738d3357c205795fd5846d023bd7935bb83ba93a1ba0f7124205e/mne-0.21.2-py3-none-any.whl (6.8MB)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pactools) (0.10.0)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pactools) (2.8.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pactools) (2.4.7)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pactools) (1.3.1)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from h5py->pactools) (1.15.0)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->pactools) (0.17.0)
Installing collected packages: mne, pactools
Successfully installed mne-0.21.2 pactools-0.3.1
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from pactools.grid_search import GridSearchCVProgressBar
from sklearn import svm
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
/usr/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject
  return f(*args, **kwds)
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
Num GPUs Available:  1

Preprocess

Assign holiday and weekend flags to the table

def time_desc(x: datetime):
    """Bin the hour of day into one of six coarse time-of-day labels."""
    Early_Morning = [4,5,6,7]
    Morning = [8,9,10,11]
    Afternoon = [12,13,14,15]
    Evening = [16,17,18,19]
    Night = [20,21,22,23]
    Late_Night = [0,1,2,3]


    if x.hour in Early_Morning:
        return 'early'
    elif x.hour in Morning:
        return 'morning'
    elif x.hour in Afternoon:
        return 'noon'
    elif x.hour in Evening:
        return 'afternoon'
    elif x.hour in Night:
        return 'night'
    else:
        return 'latenight'

def add_holiday_and_weekend(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add holiday and weekend to the dataset
    """
    new_df = df.copy()
    new_df['IsWeekend'] = new_df['date'].apply(lambda x:0 if x.weekday() in [0,1,2,3,4] else 1)
    new_df['IsHoliday']=new_df['date'].apply(lambda x:1 if (x.date().strftime('%Y-%m-%d') in [
           '2017-01-02','2017-01-28','2017-01-30','2017-01-31','2017-04-04',
           '2017-04-14','2017-04-15','2017-04-17','2017-05-01','2017-05-03',
           '2017-05-30','2017-07-01','2017-10-02','2017-10-05','2017-10-28',
           '2017-12-25','2017-12-26',
           '2018-01-01','2018-02-16','2018-02-17','2018-02-19','2018-03-30',
           '2018-03-31','2018-04-02','2018-04-05','2018-05-01','2018-05-22',
           '2018-06-18','2018-07-02','2018-09-25','2018-10-01','2018-10-17',
           '2018-12-25','2018-12-26'
           ])
           or(x.weekday() in[6]) else 0)
    return new_df
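A quick sanity check of time_desc's binning (illustrative; note that each label sits one slot "earlier" than its list name, e.g. hours 12-15 map to 'noon' and 16-19 to 'afternoon'):

assert time_desc(datetime(2017, 1, 1, 13)) == 'noon'
assert time_desc(datetime(2017, 1, 1, 17)) == 'afternoon'
assert time_desc(datetime(2017, 1, 1, 2)) == 'latenight'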

I am using Google Drive to store all the data, including the weather data, so please change the paths below to your own file paths.
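If you are running this notebook in Colab, Google Drive has to be mounted before these paths resolve; a minimal sketch:

from google.colab import drive
drive.mount('/content/drive')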

# change following two lines to your paths
train = pd.read_csv('/content/drive/MyDrive/courses/HKUST/MSBD5001/project/data/individual project/train.csv')
test = pd.read_csv('/content/drive/MyDrive/courses/HKUST/MSBD5001/project/data/individual project/test.csv')

train = train.drop('id', axis=1)
train['date'] = pd.to_datetime(train['date'])
train = add_holiday_and_weekend(train)
train['time desc']=train['date'].apply(time_desc)

train
date speed IsWeekend IsHoliday time desc
0 2017-01-01 00:00:00 43.002930 1 1 latenight
1 2017-01-01 01:00:00 46.118696 1 1 latenight
2 2017-01-01 02:00:00 44.294158 1 1 latenight
3 2017-01-01 03:00:00 41.067468 1 1 latenight
4 2017-01-01 04:00:00 46.448653 1 1 early
... ... ... ... ... ...
14001 2018-12-31 12:00:00 19.865269 0 0 noon
14002 2018-12-31 15:00:00 17.820375 0 0 noon
14003 2018-12-31 16:00:00 12.501851 0 0 afternoon
14004 2018-12-31 18:00:00 15.979319 0 0 afternoon
14005 2018-12-31 20:00:00 40.594183 0 0 night

14006 rows × 5 columns

train.plot(x='date', y='speed', figsize=(20, 10))
<matplotlib.axes._subplots.AxesSubplot at 0x7f26b0215400>

[Figure: line plot of hourly traffic speed against date, 2017-2018]


Merge weather

Pre-process weather data

from datetime import datetime
def k_to_c(x):
    """Convert a temperature from Kelvin to Celsius."""
    return x - 273.15

We have two different weather sources:

  • OpenWeather (hourly weather reports)
  • The Hong Kong Observatory (daily weather reports)

Based on my experiments, I decided to use the Hong Kong Observatory report, which gives higher accuracy.

# Change this path to yours
# weather = pd.read_csv("/content/drive/MyDrive/courses/HKUST/MSBD5001/project/data/individual project/hongkong_weather_1970-2020.csv")
# weather['dt_iso'] = weather['dt_iso'].apply(lambda x: x.replace('UTC', ''))
# weather['date'] = pd.to_datetime(weather['dt_iso']).dt.tz_convert("Asia/Hong_Kong").dt.tz_localize(None)

# # Transform unit
# weather['temp'] = weather['temp'].apply(k_to_c)
# weather['feels_like'] = weather['feels_like'].apply(k_to_c)
# weather['temp_min'] = weather['temp_min'].apply(k_to_c)
# weather['temp_max'] = weather['temp_max'].apply(k_to_c)

# weather = weather.drop(["dt_iso", "dt", "weather_icon", "rain_1h", "rain_3h", "snow_1h", "snow_3h", "sea_level", "grnd_level", "timezone", "lat", "lon"], axis=1)
# mask = (weather['date'] >= datetime(2017,1,1)) & (weather['date'] <= datetime(2019, 1, 1))
# weather = weather.loc[mask]

# weather
weather = pd.read_csv("/content/drive/MyDrive/courses/HKUST/MSBD5001/project/data/individual project/wr.csv")

# Build a datetime column from the separate year/month/day columns
weather['date'] = pd.to_datetime(weather.year*10000 + weather.month*100 + weather.day, format='%Y%m%d')
weather.drop(['Unnamed: 0', 'year', 'month', 'day'], axis=1, inplace=True)
# '-' and 'Trace' mean no measurable rainfall; map them to 0 and coerce to float
weather['TRainfall(mm)'] = weather['TRainfall(mm)'].apply(lambda x: 0 if x in ['-', 'Trace'] else x).astype(float)
weather.head()
MPressure(hPa) MaxTemp(℃) MinTemp(℃) MCloud(%) TRainfall(mm) #hRedVisi(h) WindDirect(degrees) MWindSpeed(km/h) date
0 1021.7 20.8 18.4 72 0 0 60 34.2 2017-01-01
1 1020.2 23.3 18.4 28 0 0 70 17.7 2017-01-02
2 1019.8 21.3 18.9 56 0 5 70 26.1 2017-01-03
3 1018.7 21.7 18.7 51 0 0 70 27.7 2017-01-04
4 1016.9 23.4 18.9 61 0 0 40 14.3 2017-01-05

Merge

from pandas import DatetimeIndex
def merge_weather(df: pd.DataFrame, weather: pd.DataFrame, how='left') -> pd.DataFrame:
    '''
    Merge the daily weather report onto each hourly row by calendar date,
    then derive calendar features (hour, day, month, year, weekday).
    '''
    new_df = df.copy()
    new_weather = weather.copy()

    # new_df['tmp_date'] = new_df['date']
    # new_weather['tmp_date'] = new_weather['date']

    new_df['tmp_date'] = new_df['date'].apply(lambda date: date.date())
    new_weather['tmp_date'] = new_weather['date'].apply(lambda date: date.date())

    new_training_data = new_df.merge(new_weather, on='tmp_date', how=how)
    new_training_data = new_training_data.drop(['tmp_date', 'date_y'], axis=1)
    new_training_data = new_training_data.rename(columns={'date_x': 'date'})

    new_training_data['hour'] = DatetimeIndex(new_training_data['date']).hour
    new_training_data['day'] = DatetimeIndex(new_training_data['date']).day
    new_training_data['month'] = DatetimeIndex(new_training_data['date']).month
    new_training_data['year'] = DatetimeIndex(new_training_data['date']).year
    new_training_data['weekday'] = DatetimeIndex(new_training_data['date']).weekday
    return new_training_data
new_training_data = merge_weather(train, weather)

new_training_data
date speed IsWeekend IsHoliday time desc MPressure(hPa) MaxTemp(℃) MinTemp(℃) MCloud(%) TRainfall(mm) #hRedVisi(h) WindDirect(degrees) MWindSpeed(km/h) hour day month year weekday
0 2017-01-01 00:00:00 43.002930 1 1 latenight 1021.7 20.8 18.4 72 0 0 60 34.2 0 1 1 2017 6
1 2017-01-01 01:00:00 46.118696 1 1 latenight 1021.7 20.8 18.4 72 0 0 60 34.2 1 1 1 2017 6
2 2017-01-01 02:00:00 44.294158 1 1 latenight 1021.7 20.8 18.4 72 0 0 60 34.2 2 1 1 2017 6
3 2017-01-01 03:00:00 41.067468 1 1 latenight 1021.7 20.8 18.4 72 0 0 60 34.2 3 1 1 2017 6
4 2017-01-01 04:00:00 46.448653 1 1 early 1021.7 20.8 18.4 72 0 0 60 34.2 4 1 1 2017 6
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14001 2018-12-31 12:00:00 19.865269 0 0 noon 1027.0 15.6 11.8 77 0 0 360 26.8 12 31 12 2018 0
14002 2018-12-31 15:00:00 17.820375 0 0 noon 1027.0 15.6 11.8 77 0 0 360 26.8 15 31 12 2018 0
14003 2018-12-31 16:00:00 12.501851 0 0 afternoon 1027.0 15.6 11.8 77 0 0 360 26.8 16 31 12 2018 0
14004 2018-12-31 18:00:00 15.979319 0 0 afternoon 1027.0 15.6 11.8 77 0 0 360 26.8 18 31 12 2018 0
14005 2018-12-31 20:00:00 40.594183 0 0 night 1027.0 15.6 11.8 77 0 0 360 26.8 20 31 12 2018 0

14006 rows × 18 columns

Plot data

plt.figure(figsize=(6,4))
sns.boxplot('speed',data=new_training_data,orient='h',palette="Set3",linewidth=2.5)
plt.show()

[Figure: box plot of traffic speed]

data_plot = new_training_data.copy()  # work on a copy so the training frame is not mutated
data_plot['month'] = data_plot['date'].dt.month
data_plot
tmp_data=new_training_data.groupby('month').aggregate({'speed':'mean'})
plt.figure(figsize=(8,6))
sns.lineplot(x=tmp_data.index,y=tmp_data.speed,data=tmp_data,palette="Set2")
plt.show()
plt.figure(figsize=(8,6))
sns.countplot(y='time desc',data=new_training_data,palette=["#7fcdbb","#edf8b1","#fc9272","#fee0d2","#bcbddc","#efedf5"])
plt.show()
new_training_data.hist(bins=50,figsize=(20,15))
plt.show()

Train

new_training_data.columns
Index(['date', 'speed', 'IsWeekend', 'IsHoliday', 'time desc',
       'MPressure(hPa)', 'MaxTemp(℃)', 'MinTemp(℃)', 'MCloud(%)',
       'TRainfall(mm)', '#hRedVisi(h)', 'WindDirect(degrees)',
       'MWindSpeed(km/h)', 'hour', 'day', 'month', 'year', 'weekday'],
      dtype='object')
# using open weather dataset
# cat_vars=['IsWeekend','IsHoliday','time desc']
# num_vars=['temp', 'pressure', 'wind_speed', 'humidity', "clouds_all", 'month', 'year', 'day', 'hour']

#using hong kong government dataset
cat_vars=['IsWeekend','IsHoliday','time desc', "weekday", "day",  'month', 'year']
num_vars=['MaxTemp(℃)','MinTemp(℃)','MCloud(%)',
          'TRainfall(mm)','#hRedVisi(h)','WindDirect(degrees)',
          'MWindSpeed(km/h)', 'hour']

Transform Data

numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])
categorical_transformer = Pipeline(steps=[
    ('oneHot', OneHotEncoder(sparse=False))])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_vars),
    ('cat', categorical_transformer, cat_vars)])

data_transformed = preprocessor.fit_transform(new_training_data)
print(data_transformed.shape)
(14006, 70)
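The 70 output columns are the 8 scaled numeric features plus 62 one-hot columns (2 + 2 + 6 + 7 + 31 + 12 + 2 categories across the seven categorical variables). This can be verified against the fitted encoder:

ohe = preprocessor.named_transformers_['cat'].named_steps['oneHot']
print(len(num_vars), '+', sum(len(c) for c in ohe.categories_))  # 8 + 62 = 70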
y = new_training_data['speed']
# y = y.to_numpy()
# y = y.reshape(-1, 1)
# print(y.shape)
# print(y)

We also considered scaling the target speed (see the commented-out lines above), but keep it in its original units so the reported MSE is directly interpretable.

import numpy as np
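If you did want a scaled target, a minimal sketch (not used in the runs below); predictions would then need to be mapped back with inverse_transform:

y_scaler = MinMaxScaler()
y_scaled = y_scaler.fit_transform(y.to_numpy().reshape(-1, 1))
# After training on y_scaled, map predictions back to the original units:
# y_pred = y_scaler.inverse_transform(model.predict(X_test))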

Split data

X_train, X_test, y_train, y_test = train_test_split(data_transformed, y, test_size=0.15, random_state=42)
print(X_train)
print(y_train)
print(f"Train x shape: {X_train.shape}")
print(f"Train y shape: {y_train.shape}")
[[0.36923077 0.42608696 0.70833333 ... 0.         0.         1.        ]
 [0.83076923 0.90434783 0.78125    ... 0.         1.         0.        ]
 [0.84230769 0.95652174 0.5625     ... 0.         1.         0.        ]
 ...
 [0.84230769 0.95652174 0.5625     ... 0.         1.         0.        ]
 [0.76923077 0.94782609 0.875      ... 0.         1.         0.        ]
 [0.55384615 0.53043478 0.48958333 ... 0.         1.         0.        ]]
9443     46.813428
4822     15.018334
5391     48.414942
11099    19.663880
7854     10.389012
           ...    
5191     24.637279
13418    35.306286
5390     46.950711
860      46.623319
7270     16.718626
Name: speed, Length: 11905, dtype: float64
Train x shape: (11905, 70)
Train y shape: (11905,)
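Note that train_test_split shuffles the rows, so this hold-out set is a random sample rather than a future time window. For time-series data a chronological hold-out is a defensible alternative; a sketch, assuming the rows are in date order as in train.csv:

split = int(len(data_transformed) * 0.85)
X_train_chrono, X_test_chrono = data_transformed[:split], data_transformed[split:]
y_train_chrono, y_test_chrono = y.iloc[:split], y.iloc[split:]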

Training with XGBRegressor

tscv = TimeSeriesSplit(n_splits=3)
model = XGBRegressor()

param_grid = {'nthread': [4, 6, 8],
              'objective': ['reg:squarederror'],
              'learning_rate': [0.03, 0.05, 0.07],
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

xgb = GridSearchCV(estimator=model, param_grid=param_grid, cv=tscv, n_jobs=4, verbose=10)
xgb.fit(X_train, y_train)
Fitting 3 folds for each of 27 candidates, totalling 81 fits

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.

KeyboardInterrupt: the fit was interrupted manually before the search finished.

Training with a DNN

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(X_train.shape)
(11905, 70)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
model = keras.Sequential(
    [
        layers.Dense(128, activation="relu", input_shape=(X_train.shape[1],)),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.2),
        layers.Dense(64, activation="relu"),
        layers.Dense(1),
    ]
)

opt = keras.optimizers.Adam(learning_rate=0.001)

model.compile(loss='mean_squared_error',
              optimizer=opt, metrics=['mae', 'mse'])
history = model.fit(X_train, y_train, epochs=200, validation_split=0.2, callbacks=[early_stop])
Epoch 1/200
298/298 [==============================] - 1s 4ms/step - loss: 169.2792 - mae: 9.2261 - mse: 169.2792 - val_loss: 49.0226 - val_mae: 5.3678 - val_mse: 49.0226
Epoch 2/200
298/298 [==============================] - 1s 3ms/step - loss: 53.3526 - mae: 5.5417 - mse: 53.3526 - val_loss: 45.0271 - val_mae: 5.0866 - val_mse: 45.0271
Epoch 3/200
298/298 [==============================] - 1s 3ms/step - loss: 49.2738 - mae: 5.3108 - mse: 49.2738 - val_loss: 43.7640 - val_mae: 5.1142 - val_mse: 43.7640
Epoch 4/200
298/298 [==============================] - 1s 4ms/step - loss: 46.0532 - mae: 5.1287 - mse: 46.0532 - val_loss: 40.5389 - val_mae: 4.7757 - val_mse: 40.5389
Epoch 5/200
298/298 [==============================] - 1s 3ms/step - loss: 42.8026 - mae: 4.9785 - mse: 42.8026 - val_loss: 40.7752 - val_mae: 4.9557 - val_mse: 40.7752
Epoch 6/200
298/298 [==============================] - 1s 4ms/step - loss: 40.8385 - mae: 4.9025 - mse: 40.8385 - val_loss: 38.6987 - val_mae: 4.7153 - val_mse: 38.6987
Epoch 7/200
298/298 [==============================] - 1s 3ms/step - loss: 38.1873 - mae: 4.7325 - mse: 38.1873 - val_loss: 38.2969 - val_mae: 4.6748 - val_mse: 38.2969
Epoch 8/200
298/298 [==============================] - 1s 4ms/step - loss: 37.1973 - mae: 4.6986 - mse: 37.1973 - val_loss: 37.6423 - val_mae: 4.6165 - val_mse: 37.6423
Epoch 9/200
298/298 [==============================] - 1s 4ms/step - loss: 35.9464 - mae: 4.6294 - mse: 35.9464 - val_loss: 39.2058 - val_mae: 4.9141 - val_mse: 39.2058
Epoch 10/200
298/298 [==============================] - 1s 3ms/step - loss: 34.3299 - mae: 4.5449 - mse: 34.3299 - val_loss: 37.6090 - val_mae: 4.7696 - val_mse: 37.6090
Epoch 11/200
298/298 [==============================] - 1s 3ms/step - loss: 32.5047 - mae: 4.4394 - mse: 32.5047 - val_loss: 37.4612 - val_mae: 4.8729 - val_mse: 37.4612
Epoch 12/200
298/298 [==============================] - 1s 4ms/step - loss: 32.2329 - mae: 4.4059 - mse: 32.2329 - val_loss: 41.9161 - val_mae: 5.2683 - val_mse: 41.9161
Epoch 13/200
298/298 [==============================] - 1s 4ms/step - loss: 31.2041 - mae: 4.3138 - mse: 31.2041 - val_loss: 35.0004 - val_mae: 4.6513 - val_mse: 35.0004
Epoch 14/200
298/298 [==============================] - 1s 3ms/step - loss: 30.4892 - mae: 4.2764 - mse: 30.4892 - val_loss: 33.7725 - val_mae: 4.4478 - val_mse: 33.7725
Epoch 15/200
298/298 [==============================] - 1s 4ms/step - loss: 29.2411 - mae: 4.1746 - mse: 29.2411 - val_loss: 32.9828 - val_mae: 4.2607 - val_mse: 32.9828
Epoch 16/200
298/298 [==============================] - 1s 4ms/step - loss: 28.5754 - mae: 4.1273 - mse: 28.5754 - val_loss: 32.2138 - val_mae: 4.2323 - val_mse: 32.2138
Epoch 17/200
298/298 [==============================] - 1s 4ms/step - loss: 27.3392 - mae: 4.0219 - mse: 27.3392 - val_loss: 32.4703 - val_mae: 4.4164 - val_mse: 32.4703
Epoch 18/200
298/298 [==============================] - 1s 3ms/step - loss: 27.5931 - mae: 4.0547 - mse: 27.5931 - val_loss: 31.8828 - val_mae: 4.2749 - val_mse: 31.8828
Epoch 19/200
298/298 [==============================] - 1s 3ms/step - loss: 26.7623 - mae: 3.9968 - mse: 26.7623 - val_loss: 32.7509 - val_mae: 4.3703 - val_mse: 32.7509
Epoch 20/200
298/298 [==============================] - 1s 4ms/step - loss: 26.7770 - mae: 3.9945 - mse: 26.7770 - val_loss: 31.6045 - val_mae: 4.2057 - val_mse: 31.6045
Epoch 21/200
298/298 [==============================] - 1s 3ms/step - loss: 25.9685 - mae: 3.9541 - mse: 25.9685 - val_loss: 31.0973 - val_mae: 4.1812 - val_mse: 31.0973
Epoch 22/200
298/298 [==============================] - 1s 4ms/step - loss: 26.3720 - mae: 3.9701 - mse: 26.3720 - val_loss: 31.9482 - val_mae: 4.2127 - val_mse: 31.9482
Epoch 23/200
298/298 [==============================] - 1s 3ms/step - loss: 25.3420 - mae: 3.9054 - mse: 25.3420 - val_loss: 34.5661 - val_mae: 4.6946 - val_mse: 34.5661
Epoch 24/200
298/298 [==============================] - 1s 4ms/step - loss: 25.2177 - mae: 3.8832 - mse: 25.2177 - val_loss: 31.2380 - val_mae: 4.1930 - val_mse: 31.2380
Epoch 25/200
298/298 [==============================] - 1s 4ms/step - loss: 24.5446 - mae: 3.8306 - mse: 24.5446 - val_loss: 31.1003 - val_mae: 4.2247 - val_mse: 31.1003
Epoch 26/200
298/298 [==============================] - 1s 4ms/step - loss: 24.4506 - mae: 3.8159 - mse: 24.4506 - val_loss: 31.3635 - val_mae: 4.3589 - val_mse: 31.3635
Epoch 27/200
298/298 [==============================] - 1s 4ms/step - loss: 24.0837 - mae: 3.7934 - mse: 24.0837 - val_loss: 30.2161 - val_mae: 4.2576 - val_mse: 30.2161
Epoch 28/200
298/298 [==============================] - 1s 3ms/step - loss: 23.7172 - mae: 3.7904 - mse: 23.7172 - val_loss: 30.6251 - val_mae: 4.1824 - val_mse: 30.6251
Epoch 29/200
298/298 [==============================] - 1s 3ms/step - loss: 23.0786 - mae: 3.7103 - mse: 23.0786 - val_loss: 30.5365 - val_mae: 4.2229 - val_mse: 30.5365
Epoch 30/200
298/298 [==============================] - 1s 3ms/step - loss: 23.7149 - mae: 3.7750 - mse: 23.7149 - val_loss: 30.7331 - val_mae: 4.1697 - val_mse: 30.7331
Epoch 31/200
298/298 [==============================] - 1s 3ms/step - loss: 22.5444 - mae: 3.6918 - mse: 22.5444 - val_loss: 29.5841 - val_mae: 4.1091 - val_mse: 29.5841
Epoch 32/200
298/298 [==============================] - 1s 4ms/step - loss: 22.6646 - mae: 3.6896 - mse: 22.6646 - val_loss: 31.0110 - val_mae: 4.2368 - val_mse: 31.0110
Epoch 33/200
298/298 [==============================] - 1s 3ms/step - loss: 22.2019 - mae: 3.6553 - mse: 22.2019 - val_loss: 30.2204 - val_mae: 4.1981 - val_mse: 30.2204
Epoch 34/200
298/298 [==============================] - 1s 3ms/step - loss: 22.1869 - mae: 3.6523 - mse: 22.1869 - val_loss: 29.4552 - val_mae: 4.0781 - val_mse: 29.4552
Epoch 35/200
298/298 [==============================] - 1s 3ms/step - loss: 21.6420 - mae: 3.6126 - mse: 21.6420 - val_loss: 28.8039 - val_mae: 4.0706 - val_mse: 28.8039
Epoch 36/200
298/298 [==============================] - 1s 4ms/step - loss: 21.1637 - mae: 3.5743 - mse: 21.1637 - val_loss: 29.7665 - val_mae: 4.1830 - val_mse: 29.7665
Epoch 37/200
298/298 [==============================] - 1s 4ms/step - loss: 21.5559 - mae: 3.6128 - mse: 21.5559 - val_loss: 29.8333 - val_mae: 4.0771 - val_mse: 29.8333
Epoch 38/200
298/298 [==============================] - 1s 4ms/step - loss: 20.9312 - mae: 3.5428 - mse: 20.9312 - val_loss: 33.7653 - val_mae: 4.3892 - val_mse: 33.7653
Epoch 39/200
298/298 [==============================] - 1s 4ms/step - loss: 20.9336 - mae: 3.5552 - mse: 20.9336 - val_loss: 29.2746 - val_mae: 4.1145 - val_mse: 29.2746
Epoch 40/200
298/298 [==============================] - 1s 4ms/step - loss: 20.7350 - mae: 3.5443 - mse: 20.7350 - val_loss: 30.5803 - val_mae: 4.2725 - val_mse: 30.5803
Epoch 41/200
298/298 [==============================] - 1s 3ms/step - loss: 20.3824 - mae: 3.5076 - mse: 20.3824 - val_loss: 29.8183 - val_mae: 4.1737 - val_mse: 29.8183
Epoch 42/200
298/298 [==============================] - 1s 4ms/step - loss: 20.4556 - mae: 3.5163 - mse: 20.4556 - val_loss: 29.3429 - val_mae: 4.0838 - val_mse: 29.3429
Epoch 43/200
298/298 [==============================] - 1s 3ms/step - loss: 20.0906 - mae: 3.4789 - mse: 20.0906 - val_loss: 30.3515 - val_mae: 4.1084 - val_mse: 30.3515
Epoch 44/200
298/298 [==============================] - 1s 3ms/step - loss: 19.9412 - mae: 3.4736 - mse: 19.9412 - val_loss: 30.2394 - val_mae: 4.2345 - val_mse: 30.2394
Epoch 45/200
298/298 [==============================] - 1s 3ms/step - loss: 19.6319 - mae: 3.4261 - mse: 19.6319 - val_loss: 30.2833 - val_mae: 4.1500 - val_mse: 30.2833
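Training stopped at epoch 45: the best validation loss (28.80) came at epoch 35, and the EarlyStopping patience of 10 epochs then expired. With the default restore_best_weights=False the model keeps the final epoch's weights rather than the best ones; a variant that restores the best weights (not what was run above) would be:

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10,
                                           restore_best_weights=True)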

Plot training history

def plot_history(history):
  # Adapted from the TensorFlow tutorial; the original "[MPG]" axis labels
  # are replaced with this task's target, speed.
  hist = pd.DataFrame(history.history)
  hist['epoch'] = history.epoch

  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Abs Error [speed]')
  plt.plot(hist['epoch'], hist['mae'], label='Train Error')
  plt.plot(hist['epoch'], hist['val_mae'], label='Val Error')
  plt.legend()

  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Square Error [$speed^2$]')
  plt.plot(hist['epoch'], hist['mse'], label='Train Error')
  plt.plot(hist['epoch'], hist['val_mse'], label='Val Error')
  plt.legend()
  plt.show()


plot_history(history)

[Figure: train vs. validation MAE per epoch]

[Figure: train vs. validation MSE per epoch]

Training with a Decision Tree

from sklearn import tree

# tree_clf = tree.DecisionTreeRegressor()
# tree_clf.fit(X_train, y_train)
tscv = TimeSeriesSplit(n_splits=3)
param_grid = {
    'max_depth': range(1, 10),
    # NB: min_samples_split=1 is invalid for DecisionTreeRegressor; those
    # candidates fail to fit and are scored as nan (error_score=nan below).
    'min_samples_split': range(1, 10),
    'min_samples_leaf': range(1, 5)
}
tree_model = tree.DecisionTreeRegressor()
tree_reg = GridSearchCV(estimator=tree_model, param_grid=param_grid, cv=tscv, n_jobs=4, verbose=10)
tree_reg.fit(X_train, y_train)
Fitting 3 folds for each of 324 candidates, totalling 972 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1742s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0517s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done  44 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0866s.) Setting batch_size=8.
[Parallel(n_jobs=4)]: Batch computation too fast (0.1193s.) Setting batch_size=16.
[Parallel(n_jobs=4)]: Done  88 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 216 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 392 tasks      | elapsed:    3.3s
[Parallel(n_jobs=4)]: Done 600 tasks      | elapsed:    5.4s
[Parallel(n_jobs=4)]: Done 808 tasks      | elapsed:    7.7s
[Parallel(n_jobs=4)]: Done 972 out of 972 | elapsed:    9.7s finished





GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
             error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                             max_depth=None, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=None,
                                             splitter='best'),
             iid='deprecated', n_jobs=4,
             param_grid={'max_depth': range(1, 10),
                         'min_samples_leaf': range(1, 5),
                         'min_samples_split': range(1, 10)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)
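The winning configuration of any of these searches can be read off the fitted GridSearchCV object, e.g. for the decision tree:

print(tree_reg.best_params_)
print(tree_reg.best_score_)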

Training with Ridge Regression

from sklearn import linear_model


param_grid = {'alpha': range(1, 10)}
reg_model = linear_model.Ridge(alpha=.5)

ridge = GridSearchCV(estimator=reg_model, param_grid=param_grid, cv=tscv, n_jobs=4, verbose=10)

ridge.fit(X_train, y_train)
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Batch computation too fast (0.0206s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0476s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  20 out of  27 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  23 out of  27 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:    0.1s finished





GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
             error_score=nan,
             estimator=Ridge(alpha=0.5, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=4, param_grid={'alpha': range(1, 10)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

Training with Random Forest

model_forest = RandomForestRegressor()

param_grid2 = {'n_estimators': [10, 50, 100, 1000],
               'max_features': range(1, 4)}

random_forest = GridSearchCV(estimator=model_forest, param_grid=param_grid2, cv=tscv, n_jobs=10, verbose=10)
random_forest.fit(X_train, y_train)
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:    1.5s
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    4.1s
[Parallel(n_jobs=10)]: Done  21 out of  36 | elapsed:    7.9s remaining:    5.7s
[Parallel(n_jobs=10)]: Done  25 out of  36 | elapsed:   10.6s remaining:    4.7s
[Parallel(n_jobs=10)]: Done  29 out of  36 | elapsed:   23.0s remaining:    5.6s
[Parallel(n_jobs=10)]: Done  33 out of  36 | elapsed:   35.6s remaining:    3.2s
[Parallel(n_jobs=10)]: Done  36 out of  36 | elapsed:   42.2s finished





GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=10,
             param_grid={'max_features': range(1, 4),
                         'n_estimators': [10, 50, 100, 1000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

Training with SVM

model_svm = svm.SVR()

param_grid3 = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
               'C': [1, 10, 100, 1000]}

# NB: no cv argument here, so GridSearchCV falls back to the default 5-fold split
svr = GridSearchCV(estimator=model_svm, n_jobs=10, verbose=10, param_grid=param_grid3)
svr.fit(X_train, y_train)
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:   48.8s
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:  1.6min
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:  2.4min
[Parallel(n_jobs=10)]: Done  26 out of  40 | elapsed:  2.4min remaining:  1.3min
[Parallel(n_jobs=10)]: Done  31 out of  40 | elapsed:  3.2min remaining:   56.1s
[Parallel(n_jobs=10)]: Done  36 out of  40 | elapsed:  3.2min remaining:   21.6s
[Parallel(n_jobs=10)]: Done  40 out of  40 | elapsed:  3.3min finished





GridSearchCV(cv=None, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=10,
             param_grid={'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

Training with a Neural Network (MLPRegressor)

from sklearn.neural_network import MLPRegressor

mlp_reg = MLPRegressor(max_iter=1000)
mlp_reg.fit(X_train, y_train)
MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=1000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

Evaluation

y_pred = model.predict(X_test)  # 'model' is the Keras DNN trained above
print(y_pred)
print(y_test)
[[35.947376]
 [19.731487]
 [46.31413 ]
 ...
 [40.15987 ]
 [14.448287]
 [32.13128 ]]
6405     17.479723
10497    19.555973
3669     48.410243
747      24.168746
902      44.950639
           ...    
11278    19.544291
2312      9.208905
6475     47.599270
6440      7.897116
10103    31.978004
Name: speed, Length: 2101, dtype: float64
# y_pred2 = xgb.predict(X_test)
# y_pred3 = random_forest.predict(X_test)
# y_pred4 = svr.predict(X_test)
# y_pred5 = tree_reg.predict(X_test)
# y_pred6 = ridge.predict(X_test)
# y_pred7 = mlp_reg.predict(X_test)
MSE_DNN=mean_squared_error(y_pred,y_test)
# MSE_rg=mean_squared_error(y_pred2,y_test)
# MSE_rf=mean_squared_error(y_pred3,y_test)
# MSE_tree=mean_squared_error(y_pred5,y_test)
# MSE_svr=mean_squared_error(y_pred4,y_test)
# MSE_ridge = mean_squared_error(y_pred6, y_test)
# MSE_mlp = mean_squared_error(y_pred7, y_test)

print(f"MSE for dnn is: {MSE_DNN}")
# print('MSE for XGBoost is '+str(MSE_rg))
# print('MSE for RandomForest is '+str(MSE_rf))
# print(f"MSE for decision is: {MSE_tree}")
# print(f"MSE for svr is: {MSE_svr}")
# print(f"MSE for ridge is: {MSE_ridge}")
# print(f"MSE for neural network is: {MSE_mlp}")
MSE for dnn is: 28.724601352433957

Based on these observations, we found that XGBoost provides the most accurate results.
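For reference, the commented-out comparison above can be condensed into a loop over the fitted models (a sketch; it assumes the corresponding fit cells all ran to completion):

for name, est in [('xgboost', xgb), ('random forest', random_forest),
                  ('svr', svr), ('decision tree', tree_reg),
                  ('ridge', ridge), ('mlp', mlp_reg)]:
    print(f'MSE for {name}: {mean_squared_error(y_test, est.predict(X_test)):.3f}')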

Make a prediction

test = pd.read_csv('/content/drive/MyDrive/courses/HKUST/MSBD5001/project/data/individual project/test.csv')
print(test.to_numpy().shape)
(3504, 2)
test['date'] = pd.to_datetime(test['date'])
test = add_holiday_and_weekend(test)
test['time desc']=test['date'].apply(time_desc)
print(test.shape)
(3504, 5)
new_test = merge_weather(test, weather, how='inner')
new_test = new_test.drop_duplicates(['date'], keep='last')
new_test
id date IsWeekend IsHoliday time desc MPressure(hPa) MaxTemp(℃) MinTemp(℃) MCloud(%) TRainfall(mm) #hRedVisi(h) WindDirect(degrees) MWindSpeed(km/h) hour day month year weekday
0 0 2018-01-01 02:00:00 0 1 latenight 1020.5 19.0 16.3 75 0 21 70 28.4 2 1 1 2018 0
1 1 2018-01-01 05:00:00 0 1 early 1020.5 19.0 16.3 75 0 21 70 28.4 5 1 1 2018 0
2 2 2018-01-01 07:00:00 0 1 early 1020.5 19.0 16.3 75 0 21 70 28.4 7 1 1 2018 0
3 3 2018-01-01 08:00:00 0 1 morning 1020.5 19.0 16.3 75 0 21 70 28.4 8 1 1 2018 0
4 4 2018-01-01 10:00:00 0 1 morning 1020.5 19.0 16.3 75 0 21 70 28.4 10 1 1 2018 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3499 3499 2018-12-31 17:00:00 0 0 afternoon 1027.0 15.6 11.8 77 0 0 360 26.8 17 31 12 2018 0
3500 3500 2018-12-31 19:00:00 0 0 afternoon 1027.0 15.6 11.8 77 0 0 360 26.8 19 31 12 2018 0
3501 3501 2018-12-31 21:00:00 0 0 night 1027.0 15.6 11.8 77 0 0 360 26.8 21 31 12 2018 0
3502 3502 2018-12-31 22:00:00 0 0 night 1027.0 15.6 11.8 77 0 0 360 26.8 22 31 12 2018 0
3503 3503 2018-12-31 23:00:00 0 0 night 1027.0 15.6 11.8 77 0 0 360 26.8 23 31 12 2018 0

3504 rows × 18 columns

# Use transform (not fit_transform) so the test set is encoded with the exact
# feature layout that was fitted on the training data.
transformed_data = preprocessor.transform(new_test)
print(transformed_data.shape)
pred = model.predict(transformed_data)
print(pred.shape)

Calling fit_transform here instead would re-fit the one-hot encoder on the test set's categories and change the feature layout, so the network would be fed a matrix whose width no longer matches its 70-dimensional input and raise InvalidArgumentError: Matrix size-incompatible.
submission = test.copy()
submission['speed'] = pred.flatten()  # pred has shape (3504, 1); flatten to a 1-D column
submission = submission[['id', 'speed']]
submission
id speed
0 0 48.946854
1 1 47.451572
2 2 37.875401
3 3 30.919249
4 4 36.527435
... ... ...
3499 3499 11.122622
3500 3500 22.704823
3501 3501 44.340752
3502 3502 36.847321
3503 3503 41.491169

3504 rows × 2 columns

# Write submission to file

submission.to_csv('submission_20720230.csv', index=False)