Finding correlation in weather data using pandas
A quick look at how to use pandas
- import libraries
- load hourly data
- create a new column by mapping the numeric weathersit codes to descriptive labels
- define a function for creating a regression plot for a specified weather condition
- create a function for computing the Pearson correlation
- print correlations for temp, atemp, hum and windspeed columns with registered and casual rides
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
%matplotlib inline
# Location of the hourly CSV file that the rest of the notebook works on.
DATA_URL = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Analysis-Workshop/master/Chapter01/data/hour.csv'

# Load the file into a DataFrame and preview its first rows.
data = pd.read_csv(DATA_URL)
data.head()
#collapse-show
# Lookup table from the numeric weathersit code to a readable label.
weather_mapping = {
    1: 'clear',
    2: 'cloudy',
    3: 'light_rain_snow',
    4: 'heavy_rain_snow',
}

# Translate every weathersit code into its label; a code that is missing
# from the table raises KeyError instead of being silently skipped.
weather_codes = data['weathersit']
data['weather'] = weather_codes.apply(lambda code: weather_mapping[code])
def create_regression_plot(data, col, weather_cond):
    """Draw regression plots of ``col`` against registered and casual rides.

    Only the rows of ``data`` whose ``weather`` value equals ``weather_cond``
    are plotted. Two ``sns.regplot`` calls are made, first for the
    ``registered`` column and then for the ``casual`` column, both with
    nearly transparent scatter points (alpha 0.05).

    Args:
        data: DataFrame providing the ``weather``, ``registered``,
            ``casual`` and ``col`` columns.
        col: name of the column used on the x axis.
        weather_cond: value of the ``weather`` column to select.

    Returns:
        The axes object returned by the last ``sns.regplot`` call, with
        empty axis labels and the title ``"<col> | <weather_cond>"``.
    """
    # keep only the observations recorded under the requested condition
    subset = data.loc[data['weather'] == weather_cond]

    # one regression per ride type, registered first and casual second
    axes = None
    for ride_type in ("registered", "casual"):
        axes = sns.regplot(
            x=col,
            y=ride_type,
            data=subset,
            scatter_kws={"alpha": 0.05},
        )

    # the title carries all the information, so the axis labels are blanked
    axes.set_title(f"{col} | {weather_cond}")
    axes.set_xlabel("")
    axes.set_ylabel("")
    return axes
weather_conditions = data.weather.unique()
columns = ["temp", "atemp", "hum", "windspeed"]

# The grid has one row per feature column and one column per weather
# condition. Both sizes are derived from the sequences above so that the
# subplot indices stay unique when the data holds more or fewer than four
# weather conditions.
n_rows = len(columns)
n_cols = len(weather_conditions)

plt.figure(figsize=(20,30))
for grid_row, col in enumerate(columns):
    for grid_col, weather_cond in enumerate(weather_conditions):
        # subplot positions are 1-based and numbered row by row
        plot_number = grid_row * n_cols + grid_col + 1
        plt.subplot(n_rows, n_cols, plot_number)
        create_regression_plot(data, col, weather_cond)
def print_correlations(data, col, weather_cond):
    """Print Pearson correlations between ``col`` and both ride counts.

    The rows of ``data`` whose ``weather`` value equals ``weather_cond`` are
    selected, and the Pearson correlation of ``col`` with the ``registered``
    column and with the ``casual`` column is printed, one line each. Every
    line names the column and the weather condition so that the output of
    repeated calls can be told apart.

    Args:
        data: DataFrame providing the ``weather``, ``registered``,
            ``casual`` and ``col`` columns.
        col: name of the column to correlate with the ride counts.
        weather_cond: value of the ``weather`` column to select.

    Returns:
        None. The correlation coefficient and p-value (three decimals each)
        are written to standard output.
    """
    # extract data for the specific weather condition
    corr_data = data[data['weather'] == weather_cond]
    label = f"{col} | {weather_cond}"

    # pearsonr needs at least two observations and raises otherwise;
    # report the shortage instead of aborting the whole run
    if len(corr_data) < 2:
        print(f"Pearson correlation ({label}): not enough data (n={len(corr_data)})")
        return

    # compute and print the correlation of col with each kind of ride count
    for target in ("registered", "casual"):
        result = pearsonr(corr_data[col], corr_data[target])
        print(f"Pearson correlation ({target}, {label}): corr={result[0]:.03f}, pval={result[1]:.03f}")
# Report the correlations for every (column, weather condition) combination,
# walking through all weather conditions of one column before moving on to
# the next column.
weather_conditions = data["weather"].unique()
columns = ["temp", "atemp", "hum", "windspeed"]
for col, weather_cond in [(c, w) for c in columns for w in weather_conditions]:
    print_correlations(data, col, weather_cond)