Speed Prediction¶
In this project, we will use weather data to predict average traffic speed.

- We will use weather data from openweathermap.org, which provides temperature, wind, humidity, and weather conditions.
- This notebook runs on Google Colab.

Some of the data preprocessing techniques are based on the online notebooks from tensorflow.org:
https://www.tensorflow.org/tutorials/structured_data/time_series
Install and import dependencies¶
!pip install pactools
Collecting pactools
  Downloading pactools-0.3.1-py3-none-any.whl (82 kB)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from pactools) (3.2.2)
Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from pactools) (2.10.0)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from pactools) (0.22.2.post1)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from pactools) (1.18.5)
Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from pactools) (1.4.1)
Collecting mne
  Downloading mne-0.21.2-py3-none-any.whl (6.8 MB)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pactools) (0.10.0)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pactools) (2.8.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pactools) (2.4.7)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->pactools) (1.3.1)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from h5py->pactools) (1.15.0)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->pactools) (0.17.0)
Installing collected packages: mne, pactools
Successfully installed mne-0.21.2 pactools-0.3.1
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer,OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from pactools.grid_search import GridSearchCVProgressBar
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
/usr/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject
  return f(*args, **kwds)
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
Num GPUs Available: 1
Preprocess¶
Assign holiday and time-of-day features¶
def time_desc(x: datetime) -> str:
    """Map a timestamp's hour to a coarse time-of-day label.

    Note the labels are shifted relative to the list names:
    hours 12-15 are labelled 'noon' and 16-19 'afternoon'.
    """
    Early_Morning = [4, 5, 6, 7]
    Morning = [8, 9, 10, 11]
    Afternoon = [12, 13, 14, 15]
    Evening = [16, 17, 18, 19]
    Night = [20, 21, 22, 23]
    Late_Night = [0, 1, 2, 3]
    if x.hour in Early_Morning:
        return 'early'
    elif x.hour in Morning:
        return 'morning'
    elif x.hour in Afternoon:
        return 'noon'
    elif x.hour in Evening:
        return 'afternoon'
    elif x.hour in Night:
        return 'night'
    else:  # Late_Night hours
        return 'latenight'
def add_holiday_and_weekend(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add IsWeekend and IsHoliday indicator columns to the dataset.
    IsHoliday covers the listed 2017/2018 Hong Kong public holidays plus Sundays.
    """
    new_df = df.copy()
    # weekday(): Monday=0 ... Sunday=6, so 5 and 6 are the weekend
    new_df['IsWeekend'] = new_df['date'].apply(lambda x: 0 if x.weekday() in [0, 1, 2, 3, 4] else 1)
    holidays = [
        '2017-01-02', '2017-01-28', '2017-01-30', '2017-01-31', '2017-04-04',
        '2017-04-14', '2017-04-15', '2017-04-17', '2017-05-01', '2017-05-03',
        '2017-05-30', '2017-07-01', '2017-10-02', '2017-10-05', '2017-10-28',
        '2017-12-25', '2017-12-26',
        '2018-01-01', '2018-02-16', '2018-02-17', '2018-02-19', '2018-03-30',
        '2018-03-31', '2018-04-02', '2018-04-05', '2018-05-01', '2018-05-22',
        '2018-06-18', '2018-07-02', '2018-09-25', '2018-10-01', '2018-10-17',
        '2018-12-25', '2018-12-26',
    ]
    new_df['IsHoliday'] = new_df['date'].apply(
        lambda x: 1 if (x.strftime('%Y-%m-%d') in holidays or x.weekday() == 6) else 0)
    return new_df
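As a quick sanity check, here is a minimal sketch (toy timestamps, not rows from the dataset) of what the two helpers produce:

# Toy example (hypothetical dates) exercising the feature helpers above.
toy = pd.DataFrame({'date': pd.to_datetime(['2017-01-02 08:00', '2017-06-10 22:00'])})
toy = add_holiday_and_weekend(toy)
toy['time desc'] = toy['date'].apply(time_desc)
print(toy)
# 2017-01-02 08:00 -> listed holiday, a Monday:  IsWeekend=0, IsHoliday=1, 'morning'
# 2017-06-10 22:00 -> an ordinary Saturday:      IsWeekend=1, IsHoliday=0, 'night'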
I am using Google Drive to store all the data, including the weather data, so please change the following lines to your own file paths.
# change following two lines to your paths
train = pd.read_csv('/content/drive/MyDrive/courses/HKUST/MSBD5001/project/data/individual project/train.csv')
test = pd.read_csv('/content/drive/MyDrive/courses/HKUST/MSBD5001/project/data/individual project/test.csv')
train = train.drop('id', axis=1)
train['date'] = pd.to_datetime(train['date'])
train = add_holiday_and_weekend(train)
train['time desc']=train['date'].apply(time_desc)
train
| | date | speed | IsWeekend | IsHoliday | time desc |
|---|---|---|---|---|---|
| 0 | 2017-01-01 00:00:00 | 43.002930 | 1 | 1 | latenight |
| 1 | 2017-01-01 01:00:00 | 46.118696 | 1 | 1 | latenight |
| 2 | 2017-01-01 02:00:00 | 44.294158 | 1 | 1 | latenight |
| 3 | 2017-01-01 03:00:00 | 41.067468 | 1 | 1 | latenight |
| 4 | 2017-01-01 04:00:00 | 46.448653 | 1 | 1 | early |
| ... | ... | ... | ... | ... | ... |
| 14001 | 2018-12-31 12:00:00 | 19.865269 | 0 | 0 | noon |
| 14002 | 2018-12-31 15:00:00 | 17.820375 | 0 | 0 | noon |
| 14003 | 2018-12-31 16:00:00 | 12.501851 | 0 | 0 | afternoon |
| 14004 | 2018-12-31 18:00:00 | 15.979319 | 0 | 0 | afternoon |
| 14005 | 2018-12-31 20:00:00 | 40.594183 | 0 | 0 | night |
14006 rows × 5 columns
train.plot(x='date', y='speed', figsize=(20, 10))
[Figure: line plot of hourly average speed over 2017–2018]
Merge weather¶
Pre-process weather data¶
from datetime import datetime
def k_to_c(x):
    """Convert temperature from Kelvin to Celsius."""
    return x - 273.15
We have two different weather sources:

- openweathermap.org (hourly weather reports)
- the Hong Kong Observatory (daily weather reports)

Based on my experiments, I decided to use the Hong Kong Observatory's report, which yields higher accuracy.
# Change this path to yours
# weather = pd.read_csv("/content/drive/MyDrive/courses/HKUST/MSBD5001/project/data/individual project/hongkong_weather_1970-2020.csv")
# weather['dt_iso'] = weather['dt_iso'].apply(lambda x: x.replace('UTC', ''))
# weather['date'] = pd.to_datetime(weather['dt_iso']).dt.tz_convert("Asia/Hong_Kong").dt.tz_localize(None)
# # Transform unit
# weather['temp'] = weather['temp'].apply(k_to_c)
# weather['feels_like'] = weather['feels_like'].apply(k_to_c)
# weather['temp_min'] = weather['temp_min'].apply(k_to_c)
# weather['temp_max'] = weather['temp_max'].apply(k_to_c)
# weather = weather.drop(["dt_iso", "dt", "weather_icon", "rain_1h", "rain_3h", "snow_1h", "snow_3h", "sea_level", "grnd_level", "timezone", "lat", "lon"], axis=1)
# mask = (weather['date'] >= datetime(2017,1,1)) & (weather['date'] <= datetime(2019, 1, 1))
# weather = weather.loc[mask]
# weather
weather = pd.read_csv("/content/drive/MyDrive/courses/HKUST/MSBD5001/project/data/individual project/wr.csv")
# Rebuild a proper datetime from the separate year/month/day columns
weather['date'] = pd.to_datetime(weather.year * 10000 + weather.month * 100 + weather.day, format='%Y%m%d')
weather.drop(['Unnamed: 0', 'year', 'month', 'day'], axis=1, inplace=True)
# The rainfall column uses '-' and 'Trace' for (near-)zero rainfall; map them
# to 0 and cast to float so the scaler receives a numeric column.
weather['TRainfall(mm)'] = weather['TRainfall(mm)'].apply(lambda x: 0 if x in ['-', 'Trace'] else x).astype(float)
weather.head()
| | MPressure(hPa) | MaxTemp(℃) | MinTemp(℃) | MCloud(%) | TRainfall(mm) | #hRedVisi(h) | WindDirect(degrees) | MWindSpeed(km/h) | date |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1021.7 | 20.8 | 18.4 | 72 | 0 | 0 | 60 | 34.2 | 2017-01-01 |
| 1 | 1020.2 | 23.3 | 18.4 | 28 | 0 | 0 | 70 | 17.7 | 2017-01-02 |
| 2 | 1019.8 | 21.3 | 18.9 | 56 | 0 | 5 | 70 | 26.1 | 2017-01-03 |
| 3 | 1018.7 | 21.7 | 18.7 | 51 | 0 | 0 | 70 | 27.7 | 2017-01-04 |
| 4 | 1016.9 | 23.4 | 18.9 | 61 | 0 | 0 | 40 | 14.3 | 2017-01-05 |
Merge¶
from pandas import DatetimeIndex
def merge_weather(df: pd.DataFrame, weather: pd.DataFrame, how='left') -> pd.DataFrame:
    '''
    Merge the daily weather report into the hourly data on calendar date,
    then expand the timestamp into hour/day/month/year/weekday columns.
    '''
    new_df = df.copy()
    new_weather = weather.copy()
    # Join key: the calendar date, so each daily weather row is broadcast
    # to all hourly rows of that day.
    new_df['tmp_date'] = new_df['date'].apply(lambda date: date.date())
    new_weather['tmp_date'] = new_weather['date'].apply(lambda date: date.date())
    new_training_data = new_df.merge(new_weather, on='tmp_date', how=how)
    new_training_data = new_training_data.drop(['tmp_date', 'date_y'], axis=1)
    new_training_data = new_training_data.rename(columns={'date_x': 'date'})
    new_training_data['hour'] = DatetimeIndex(new_training_data['date']).hour
    new_training_data['day'] = DatetimeIndex(new_training_data['date']).day
    new_training_data['month'] = DatetimeIndex(new_training_data['date']).month
    new_training_data['year'] = DatetimeIndex(new_training_data['date']).year
    new_training_data['weekday'] = DatetimeIndex(new_training_data['date']).weekday
    return new_training_data
new_training_data = merge_weather(train, weather)
new_training_data
| | date | speed | IsWeekend | IsHoliday | time desc | MPressure(hPa) | MaxTemp(℃) | MinTemp(℃) | MCloud(%) | TRainfall(mm) | #hRedVisi(h) | WindDirect(degrees) | MWindSpeed(km/h) | hour | day | month | year | weekday |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017-01-01 00:00:00 | 43.002930 | 1 | 1 | latenight | 1021.7 | 20.8 | 18.4 | 72 | 0 | 0 | 60 | 34.2 | 0 | 1 | 1 | 2017 | 6 |
| 1 | 2017-01-01 01:00:00 | 46.118696 | 1 | 1 | latenight | 1021.7 | 20.8 | 18.4 | 72 | 0 | 0 | 60 | 34.2 | 1 | 1 | 1 | 2017 | 6 |
| 2 | 2017-01-01 02:00:00 | 44.294158 | 1 | 1 | latenight | 1021.7 | 20.8 | 18.4 | 72 | 0 | 0 | 60 | 34.2 | 2 | 1 | 1 | 2017 | 6 |
| 3 | 2017-01-01 03:00:00 | 41.067468 | 1 | 1 | latenight | 1021.7 | 20.8 | 18.4 | 72 | 0 | 0 | 60 | 34.2 | 3 | 1 | 1 | 2017 | 6 |
| 4 | 2017-01-01 04:00:00 | 46.448653 | 1 | 1 | early | 1021.7 | 20.8 | 18.4 | 72 | 0 | 0 | 60 | 34.2 | 4 | 1 | 1 | 2017 | 6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14001 | 2018-12-31 12:00:00 | 19.865269 | 0 | 0 | noon | 1027.0 | 15.6 | 11.8 | 77 | 0 | 0 | 360 | 26.8 | 12 | 31 | 12 | 2018 | 0 |
| 14002 | 2018-12-31 15:00:00 | 17.820375 | 0 | 0 | noon | 1027.0 | 15.6 | 11.8 | 77 | 0 | 0 | 360 | 26.8 | 15 | 31 | 12 | 2018 | 0 |
| 14003 | 2018-12-31 16:00:00 | 12.501851 | 0 | 0 | afternoon | 1027.0 | 15.6 | 11.8 | 77 | 0 | 0 | 360 | 26.8 | 16 | 31 | 12 | 2018 | 0 |
| 14004 | 2018-12-31 18:00:00 | 15.979319 | 0 | 0 | afternoon | 1027.0 | 15.6 | 11.8 | 77 | 0 | 0 | 360 | 26.8 | 18 | 31 | 12 | 2018 | 0 |
| 14005 | 2018-12-31 20:00:00 | 40.594183 | 0 | 0 | night | 1027.0 | 15.6 | 11.8 | 77 | 0 | 0 | 360 | 26.8 | 20 | 31 | 12 | 2018 | 0 |
14006 rows × 18 columns
Plot data¶
plt.figure(figsize=(6,4))
sns.boxplot('speed',data=new_training_data,orient='h',palette="Set3",linewidth=2.5)
plt.show()
Traffic speed¶
# 'month' is already present from merge_weather; take a copy so the
# plotting code does not mutate the training frame.
data_plot = new_training_data.copy()
data_plot
tmp_data=new_training_data.groupby('month').aggregate({'speed':'mean'})
plt.figure(figsize=(8,6))
sns.lineplot(x=tmp_data.index,y=tmp_data.speed,data=tmp_data,palette="Set2")
plt.show()
plt.figure(figsize=(8,6))
sns.countplot(y='time desc',data=new_training_data,palette=["#7fcdbb","#edf8b1","#fc9272","#fee0d2","#bcbddc","#efedf5"])
plt.show()
new_training_data.hist(bins=50,figsize=(20,15))
plt.show()
Train¶
new_training_data.columns
Index(['date', 'speed', 'IsWeekend', 'IsHoliday', 'time desc',
'MPressure(hPa)', 'MaxTemp(℃)', 'MinTemp(℃)', 'MCloud(%)',
'TRainfall(mm)', '#hRedVisi(h)', 'WindDirect(degrees)',
'MWindSpeed(km/h)', 'hour', 'day', 'month', 'year', 'weekday'],
dtype='object')
# using open weather dataset
# cat_vars=['IsWeekend','IsHoliday','time desc']
# num_vars=['temp', 'pressure', 'wind_speed', 'humidity', "clouds_all", 'month', 'year', 'day', 'hour']
# using the Hong Kong Observatory dataset
cat_vars=['IsWeekend','IsHoliday','time desc', "weekday", "day", 'month', 'year']
num_vars=['MaxTemp(℃)','MinTemp(℃)','MCloud(%)',
'TRainfall(mm)','#hRedVisi(h)','WindDirect(degrees)',
'MWindSpeed(km/h)', 'hour']
Transform Data¶
numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])
categorical_transformer = Pipeline(steps=[
    ('oneHot', OneHotEncoder(sparse=False))])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_vars),
    ('cat', categorical_transformer, cat_vars)])
data_transformed = preprocessor.fit_transform(new_training_data)
print(data_transformed.shape)
(14006, 70)
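To see which columns the 70 features correspond to, the fitted transformer can be inspected — a small sketch (attribute names from scikit-learn 0.22, the version installed in this runtime):

# List the feature names in output order: scaled numerics, then one-hot columns.
ohe = preprocessor.named_transformers_['cat'].named_steps['oneHot']
print(num_vars + list(ohe.get_feature_names(cat_vars)))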
y = new_training_data['speed']
# Optional: the target speed could be scaled/reshaped as well; it is left unscaled here.
# y = y.to_numpy()
# y = y.reshape(-1, 1)
# print(y.shape)
# print(y)
import numpy as np
Split data¶
X_train, X_test, y_train, y_test = train_test_split(data_transformed, y, test_size=0.15, random_state=42)
print(X_train)
print(y_train)
print(f"Train x shape: {X_train.shape}")
print(f"Train y shape: {y_train.shape}")
[[0.36923077 0.42608696 0.70833333 ... 0. 0. 1. ]
[0.83076923 0.90434783 0.78125 ... 0. 1. 0. ]
[0.84230769 0.95652174 0.5625 ... 0. 1. 0. ]
...
[0.84230769 0.95652174 0.5625 ... 0. 1. 0. ]
[0.76923077 0.94782609 0.875 ... 0. 1. 0. ]
[0.55384615 0.53043478 0.48958333 ... 0. 1. 0. ]]
9443 46.813428
4822 15.018334
5391 48.414942
11099 19.663880
7854 10.389012
...
5191 24.637279
13418 35.306286
5390 46.950711
860 46.623319
7270 16.718626
Name: speed, Length: 11905, dtype: float64
Train x shape: (11905, 70)
Train y shape: (11905,)
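Note that train_test_split shuffles the rows, so the held-out set contains timestamps from inside the training period. If strictly chronological evaluation is preferred — an alternative sketched here, not what the runs below use — the last 15% of rows can be held out instead (the rows are already in date order):

# Alternative: chronological holdout instead of a random split.
split = int(len(data_transformed) * 0.85)
X_train_chrono, X_test_chrono = data_transformed[:split], data_transformed[split:]
y_train_chrono, y_test_chrono = y.iloc[:split], y.iloc[split:]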
Training with XGBoost¶
tscv = TimeSeriesSplit(n_splits=3)
model = XGBRegressor()
param_grid = {'nthread':[4,6,8],
'objective':['reg:squarederror'],
'learning_rate':[.03, 0.05, .07],
'max_depth':[5, 6, 7],
'min_child_weight':[4],
'subsample':[0.7],
'colsample_bytree':[0.7],
'n_estimators':[500]}
xgb=GridSearchCV(estimator=model,param_grid=param_grid,cv=tscv,n_jobs=4, verbose=10)
xgb.fit(X_train,y_train)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
(Traceback truncated: the grid search was interrupted manually, raising KeyboardInterrupt inside joblib's worker wait.)
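Since the full 81-fit search did not complete, a single fit with representative parameters taken from the grid above can stand in — a fallback sketch, not the tuned result:

# One representative configuration from the interrupted grid.
xgb_model = XGBRegressor(objective='reg:squarederror', learning_rate=0.05,
                         max_depth=6, min_child_weight=4, subsample=0.7,
                         colsample_bytree=0.7, n_estimators=500, n_jobs=4)
xgb_model.fit(X_train, y_train)
print(mean_squared_error(y_test, xgb_model.predict(X_test)))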
Training with a DNN (Keras)¶
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.metrics import MeanSquaredError
from tensorflow.keras import regularizers
print(X_train.shape)
(11905, 70)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
model = keras.Sequential(
    [
        layers.Dense(128, activation="relu", input_shape=(X_train.shape[1],)),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.2),  # regularization between the hidden blocks
        layers.Dense(64, activation="relu"),
        layers.Dense(1),  # single linear output for regression
    ]
)
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='mean_squared_error', optimizer=opt, metrics=['mae', 'mse'])
history = model.fit(X_train, y_train, epochs=200, validation_split=0.2, callbacks=[early_stop])
Epoch 1/200
298/298 [==============================] - 1s 4ms/step - loss: 169.2792 - mae: 9.2261 - mse: 169.2792 - val_loss: 49.0226 - val_mae: 5.3678 - val_mse: 49.0226
Epoch 2/200
298/298 [==============================] - 1s 3ms/step - loss: 53.3526 - mae: 5.5417 - mse: 53.3526 - val_loss: 45.0271 - val_mae: 5.0866 - val_mse: 45.0271
Epoch 3/200
298/298 [==============================] - 1s 3ms/step - loss: 49.2738 - mae: 5.3108 - mse: 49.2738 - val_loss: 43.7640 - val_mae: 5.1142 - val_mse: 43.7640
... (epochs 4-43 omitted; val_loss bottoms out at 28.8039 in epoch 35) ...
Epoch 44/200
298/298 [==============================] - 1s 3ms/step - loss: 19.9412 - mae: 3.4736 - mse: 19.9412 - val_loss: 30.2394 - val_mae: 4.2345 - val_mse: 30.2394
Epoch 45/200
298/298 [==============================] - 1s 3ms/step - loss: 19.6319 - mae: 3.4261 - mse: 19.6319 - val_loss: 30.2833 - val_mae: 4.1500 - val_mse: 30.2833
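Training stopped after epoch 45: the best val_loss occurred at epoch 35 and the patience of 10 then expired. One optional variant, not used in the run above, is to have the callback roll the model back to that best epoch:

# Optional: restore the weights from the epoch with the lowest val_loss.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10,
                                           restore_best_weights=True)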
Plot the training history:
def plot_history(history):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel('Mean Abs Error [speed]')
    plt.plot(hist['epoch'], hist['mae'], label='Train Error')
    plt.plot(hist['epoch'], hist['val_mae'], label='Val Error')
    plt.legend()
    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel('Mean Square Error [$speed^2$]')
    plt.plot(hist['epoch'], hist['mse'], label='Train Error')
    plt.plot(hist['epoch'], hist['val_mse'], label='Val Error')
    plt.legend()
    plt.show()
plot_history(history)
Training with a decision tree¶
from sklearn import tree
# tree_clf = tree.DecisionTreeRegressor()
# tree_clf.fit(X_train, y_train)
tscv = TimeSeriesSplit(n_splits=3)
param_grid = {
    'max_depth': range(1, 10),
    # scikit-learn requires min_samples_split >= 2; the candidates with
    # min_samples_split=1 fail to fit and score as NaN (error_score=nan),
    # so they are effectively skipped.
    'min_samples_split': range(1, 10),
    'min_samples_leaf': range(1, 5)
}
tree_model = tree.DecisionTreeRegressor()
tree_reg = GridSearchCV(estimator=tree_model, param_grid=param_grid, cv=tscv, n_jobs=4, verbose=10)
tree_reg.fit(X_train, y_train)
Fitting 3 folds for each of 324 candidates, totalling 972 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 5 tasks | elapsed: 1.2s
[Parallel(n_jobs=4)]: Done 10 tasks | elapsed: 1.2s
[Parallel(n_jobs=4)]: Done 17 tasks | elapsed: 1.3s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1742s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done 24 tasks | elapsed: 1.3s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0517s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done 44 tasks | elapsed: 1.4s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0866s.) Setting batch_size=8.
[Parallel(n_jobs=4)]: Batch computation too fast (0.1193s.) Setting batch_size=16.
[Parallel(n_jobs=4)]: Done 88 tasks | elapsed: 1.6s
[Parallel(n_jobs=4)]: Done 216 tasks | elapsed: 2.3s
[Parallel(n_jobs=4)]: Done 392 tasks | elapsed: 3.3s
[Parallel(n_jobs=4)]: Done 600 tasks | elapsed: 5.4s
[Parallel(n_jobs=4)]: Done 808 tasks | elapsed: 7.7s
[Parallel(n_jobs=4)]: Done 972 out of 972 | elapsed: 9.7s finished
GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
error_score=nan,
estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
max_depth=None, max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
presort='deprecated',
random_state=None,
splitter='best'),
iid='deprecated', n_jobs=4,
param_grid={'max_depth': range(1, 10),
'min_samples_leaf': range(1, 5),
'min_samples_split': range(1, 10)},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=10)
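The selected configuration can be read off the fitted search object (standard GridSearchCV attributes):

# Best hyper-parameters and the corresponding mean CV score.
print(tree_reg.best_params_)
print(tree_reg.best_score_)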
Training with ridge regression¶
from sklearn import linear_model
param_grid = {'alpha': range(1, 10)}
reg_model = linear_model.Ridge(alpha=.5)  # the initial alpha is overridden by the grid search
ridge = GridSearchCV(estimator=reg_model, param_grid=param_grid, cv=tscv, n_jobs=4, verbose=10)
ridge.fit(X_train, y_train)
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=4)]: Batch computation too fast (0.0206s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done 5 tasks | elapsed: 0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0476s.) Setting batch_size=4.
[Parallel(n_jobs=4)]: Done 12 tasks | elapsed: 0.1s
[Parallel(n_jobs=4)]: Done 20 out of 27 | elapsed: 0.1s remaining: 0.0s
[Parallel(n_jobs=4)]: Done 23 out of 27 | elapsed: 0.1s remaining: 0.0s
[Parallel(n_jobs=4)]: Done 27 out of 27 | elapsed: 0.1s finished
GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
error_score=nan,
estimator=Ridge(alpha=0.5, copy_X=True, fit_intercept=True,
max_iter=None, normalize=False, random_state=None,
solver='auto', tol=0.001),
iid='deprecated', n_jobs=4, param_grid={'alpha': range(1, 10)},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=10)
Training with a random forest¶
model_forest = RandomForestRegressor()
param_grid2 = {'n_estimators': [10, 50, 100, 1000],
               'max_features': range(1, 4)}
random_forest = GridSearchCV(estimator=model_forest, param_grid=param_grid2, cv=tscv, n_jobs=10, verbose=10)
random_forest.fit(X_train, y_train)
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done 5 tasks | elapsed: 1.5s
[Parallel(n_jobs=10)]: Done 12 tasks | elapsed: 4.1s
[Parallel(n_jobs=10)]: Done 21 out of 36 | elapsed: 7.9s remaining: 5.7s
[Parallel(n_jobs=10)]: Done 25 out of 36 | elapsed: 10.6s remaining: 4.7s
[Parallel(n_jobs=10)]: Done 29 out of 36 | elapsed: 23.0s remaining: 5.6s
[Parallel(n_jobs=10)]: Done 33 out of 36 | elapsed: 35.6s remaining: 3.2s
[Parallel(n_jobs=10)]: Done 36 out of 36 | elapsed: 42.2s finished
GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
error_score=nan,
estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
criterion='mse', max_depth=None,
max_features='auto',
max_leaf_nodes=None,
max_samples=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=None,
oob_score=False, random_state=None,
verbose=0, warm_start=False),
iid='deprecated', n_jobs=10,
param_grid={'max_features': range(1, 4),
'n_estimators': [10, 50, 100, 1000]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=10)
Training with an SVM¶
model_svm = svm.SVR()
param_grid3 = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
               'C': [1, 10, 100, 1000]}
# No cv argument is passed here, so GridSearchCV falls back to the default
# 5-fold split instead of the TimeSeriesSplit used above.
svr = GridSearchCV(estimator=model_svm, n_jobs=10, verbose=10, param_grid=param_grid3)
svr.fit(X_train, y_train)
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done 5 tasks | elapsed: 48.8s
[Parallel(n_jobs=10)]: Done 12 tasks | elapsed: 1.6min
[Parallel(n_jobs=10)]: Done 21 tasks | elapsed: 2.4min
[Parallel(n_jobs=10)]: Done 26 out of 40 | elapsed: 2.4min remaining: 1.3min
[Parallel(n_jobs=10)]: Done 31 out of 40 | elapsed: 3.2min remaining: 56.1s
[Parallel(n_jobs=10)]: Done 36 out of 40 | elapsed: 3.2min remaining: 21.6s
[Parallel(n_jobs=10)]: Done 40 out of 40 | elapsed: 3.3min finished
GridSearchCV(cv=None, error_score=nan,
estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
epsilon=0.1, gamma='scale', kernel='rbf',
max_iter=-1, shrinking=True, tol=0.001,
verbose=False),
iid='deprecated', n_jobs=10,
param_grid={'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
'kernel': ['rbf']},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=10)
Training with a scikit-learn MLP¶
from sklearn.neural_network import MLPRegressor
mlp_reg = MLPRegressor(max_iter=1000)
mlp_reg.fit(X_train, y_train)
MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(100,), learning_rate='constant',
learning_rate_init=0.001, max_fun=15000, max_iter=1000,
momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
power_t=0.5, random_state=None, shuffle=True, solver='adam',
tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)
Evaluation¶
# `model` here refers to the Keras DNN defined above (the variable name was reused).
y_pred = model.predict(X_test)
print(y_pred)
print(y_test)
[[35.947376]
[19.731487]
[46.31413 ]
...
[40.15987 ]
[14.448287]
[32.13128 ]]
6405 17.479723
10497 19.555973
3669 48.410243
747 24.168746
902 44.950639
...
11278 19.544291
2312 9.208905
6475 47.599270
6440 7.897116
10103 31.978004
Name: speed, Length: 2101, dtype: float64
# y_pred2 = xgb.predict(X_test)
# y_pred3 = random_forest.predict(X_test)
# y_pred4 = svr.predict(X_test)
# y_pred5 = tree_reg.predict(X_test)
# y_pred6 = ridge.predict(X_test)
# y_pred7 = mlp_reg.predict(X_test)
MSE_DNN=mean_squared_error(y_pred,y_test)
# MSE_rg=mean_squared_error(y_pred2,y_test)
# MSE_rf=mean_squared_error(y_pred3,y_test)
# MSE_tree=mean_squared_error(y_pred5,y_test)
# MSE_svr=mean_squared_error(y_pred4,y_test)
# MSE_ridge = mean_squared_error(y_pred6, y_test)
# MSE_mlp = mean_squared_error(y_pred7, y_test)
print(f"MSE for dnn is: {MSE_DNN}")
# print('MSE for XGBoost is '+str(MSE_rg))
# print('MSE for RandomForest is '+str(MSE_rf))
# print(f"MSE for decision is: {MSE_tree}")
# print(f"MSE for svr is: {MSE_svr}")
# print(f"MSE for ridge is: {MSE_ridge}")
# print(f"MSE for neural network is: {MSE_mlp}")
MSE for dnn is: 28.724601352433957
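The commented-out lines above compute the same metric for each of the other models; a consolidated sketch (assuming all of the grid searches and fits above ran to completion):

# Compare every fitted model on the same held-out split.
for name, m in [('DNN', model), ('XGBoost', xgb), ('RandomForest', random_forest),
                ('SVR', svr), ('DecisionTree', tree_reg), ('Ridge', ridge), ('MLP', mlp_reg)]:
    print(f"MSE for {name}: {mean_squared_error(y_test, m.predict(X_test))}")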
Based on these observations, we found that XGBoost provides the most accurate results.
Make a prediction¶
test = pd.read_csv('/content/drive/MyDrive/courses/HKUST/MSBD5001/project/data/individual project/test.csv')
print(test.to_numpy().shape)
(3504, 2)
test['date'] = pd.to_datetime(test['date'])
test = add_holiday_and_weekend(test)
test['time desc']=test['date'].apply(time_desc)
print(test.shape)
(3504, 5)
new_test = merge_weather(test, weather, how='inner')
# Drop any rows duplicated by the merge, keeping one row per timestamp.
new_test = new_test.drop_duplicates(['date'], keep='last')
new_test
| | id | date | IsWeekend | IsHoliday | time desc | MPressure(hPa) | MaxTemp(℃) | MinTemp(℃) | MCloud(%) | TRainfall(mm) | #hRedVisi(h) | WindDirect(degrees) | MWindSpeed(km/h) | hour | day | month | year | weekday |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2018-01-01 02:00:00 | 0 | 1 | latenight | 1020.5 | 19.0 | 16.3 | 75 | 0 | 21 | 70 | 28.4 | 2 | 1 | 1 | 2018 | 0 |
| 1 | 1 | 2018-01-01 05:00:00 | 0 | 1 | early | 1020.5 | 19.0 | 16.3 | 75 | 0 | 21 | 70 | 28.4 | 5 | 1 | 1 | 2018 | 0 |
| 2 | 2 | 2018-01-01 07:00:00 | 0 | 1 | early | 1020.5 | 19.0 | 16.3 | 75 | 0 | 21 | 70 | 28.4 | 7 | 1 | 1 | 2018 | 0 |
| 3 | 3 | 2018-01-01 08:00:00 | 0 | 1 | morning | 1020.5 | 19.0 | 16.3 | 75 | 0 | 21 | 70 | 28.4 | 8 | 1 | 1 | 2018 | 0 |
| 4 | 4 | 2018-01-01 10:00:00 | 0 | 1 | morning | 1020.5 | 19.0 | 16.3 | 75 | 0 | 21 | 70 | 28.4 | 10 | 1 | 1 | 2018 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3499 | 3499 | 2018-12-31 17:00:00 | 0 | 0 | afternoon | 1027.0 | 15.6 | 11.8 | 77 | 0 | 0 | 360 | 26.8 | 17 | 31 | 12 | 2018 | 0 |
| 3500 | 3500 | 2018-12-31 19:00:00 | 0 | 0 | afternoon | 1027.0 | 15.6 | 11.8 | 77 | 0 | 0 | 360 | 26.8 | 19 | 31 | 12 | 2018 | 0 |
| 3501 | 3501 | 2018-12-31 21:00:00 | 0 | 0 | night | 1027.0 | 15.6 | 11.8 | 77 | 0 | 0 | 360 | 26.8 | 21 | 31 | 12 | 2018 | 0 |
| 3502 | 3502 | 2018-12-31 22:00:00 | 0 | 0 | night | 1027.0 | 15.6 | 11.8 | 77 | 0 | 0 | 360 | 26.8 | 22 | 31 | 12 | 2018 | 0 |
| 3503 | 3503 | 2018-12-31 23:00:00 | 0 | 0 | night | 1027.0 | 15.6 | 11.8 | 77 | 0 | 0 | 360 | 26.8 | 23 | 31 | 12 | 2018 | 0 |
3504 rows × 18 columns
# Use transform (not fit_transform): the test set must be encoded with the
# categories and scaling learned from the training data, otherwise the
# feature space no longer matches what the model was trained on.
transformed_data = preprocessor.transform(new_test)
print(transformed_data.shape)
pred = model.predict(transformed_data)
print(pred.shape)
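One caveat with transform: it raises an error if the test set contains category values never seen during fitting (not the case here, since the 2018 test dates all fall inside the training range). If that were a risk, a variant encoder could be used when building the preprocessor:

# Hypothetical variant: ignore unseen categories instead of raising.
categorical_transformer = Pipeline(steps=[
    ('oneHot', OneHotEncoder(sparse=False, handle_unknown='ignore'))])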
submission = test.copy()
submission['speed'] = pred
submission = submission[['id', 'speed']]
submission
| | id | speed |
|---|---|---|
| 0 | 0 | 48.946854 |
| 1 | 1 | 47.451572 |
| 2 | 2 | 37.875401 |
| 3 | 3 | 30.919249 |
| 4 | 4 | 36.527435 |
| ... | ... | ... |
| 3499 | 3499 | 11.122622 |
| 3500 | 3500 | 22.704823 |
| 3501 | 3501 | 44.340752 |
| 3502 | 3502 | 36.847321 |
| 3503 | 3503 | 41.491169 |
3504 rows × 2 columns
# Write submission to file
submission.to_csv('submission_20720230.csv', index=False)