# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
# Read the trip data CSV and print the column summary of the raw frame.
df = pd.read_csv('2017-fordgobike-tripdata.csv')
df.info()

# Remove every row that contains at least one missing value, then print the
# summary again to see what is left.
df = df.dropna()
df.info()

# Cast the two timestamp columns to datetimes and the user type / gender
# columns to categoricals in a single astype call, then print the final dtypes.
df = df.astype({
    'start_time': 'datetime64[ns]',
    'end_time': 'datetime64[ns]',
    'user_type': 'category',
    'member_gender': 'category',
})
df.info()
# Histogram of rider birth years with one bin per year (assumes birth years
# are whole numbers).
# np.arange excludes its stop value, so the stop is max + 2 to make max + 1 the
# final edge. With a final edge equal to max, the last histogram bin (which is
# closed on both sides) would merge the two most recent birth years into one bar.
bin_edges = np.arange(df['member_birth_year'].min(), df['member_birth_year'].max() + 2, 1)
plt.hist(data = df, x = 'member_birth_year', bins = bin_edges);
plt.title('The Rider Birth Year');
plt.xlabel('Year of Birth');
plt.ylabel('Counts');
# Histogram of trip duration on a logarithmic x axis.
# The bin edges are evenly spaced in log10 space (steps of 0.1 between the
# exponents 1.785 and 4.936), so the bars have equal width once the axis is
# switched to a log scale.
bin_edges = np.power(10, np.arange(1.785, 4.936 + 0.1, 0.1))

# Tick positions in seconds, labelled with their plain values instead of
# matplotlib's default log-scale tick labels.
ticks = [20, 100, 200, 1000, 4000, 10000, 20000]
labls = [str(v) for v in ticks]

plt.hist(data = df, x = 'duration_sec', bins = bin_edges);
plt.xscale('log');
plt.xticks(ticks, labls);
plt.xlabel('Duration in Seconds');
plt.ylabel('Counts');
plt.title('The Duration in seconds');
# Bar chart of the number of trips per rider gender.
# Every bar gets the first colour of the current seaborn palette, so colour
# does not encode anything in this plot.
base_color = sb.color_palette()[0]
sb.countplot(x = 'member_gender', data = df, color = base_color);
plt.xlabel('Gender of Rider');
plt.title('The Rider Gender');
# Distribution of birth year for each rider gender.
# Only riders born in 1950 or later are plotted, which restricts the range
# shown on the y axis.
sb.violinplot(
    data = df[df['member_birth_year'] >= 1950],
    x = 'member_gender',
    y = 'member_birth_year',
    color = sb.color_palette()[0],
);
plt.xlabel('Gender of Rider');
plt.ylabel('Year of Birth');
plt.title('The Rider Genders and Year of Birth');
# First and third quartile of the trip duration, and the distance between
# them (the interquartile range).
q1, q3 = (df['duration_sec'].quantile(p) for p in (0.25, 0.75))
qr = q3 - q1

# Bare expressions: presumably left over from notebook cells, where they
# display the value; they have no effect when run as a plain script.
qr
q3
q1
# Distribution of trip duration for each user type.
# Only trips up to the upper fence (third quartile + 1.5 * interquartile range)
# are plotted. The fence is derived from the q3 / qr values computed earlier in
# this file rather than from hard-coded numbers, so it follows the data if the
# input file or the cleaning steps change.
sb.violinplot(
    data = df[df['duration_sec'] <= q3 + 1.5 * qr],
    x = 'user_type',
    y = 'duration_sec',
    color = sb.color_palette()[0],
);
plt.title('The Rider Type and Duration in Seconds');
plt.xlabel('User Type');
plt.ylabel('Duration in Seconds');
# Distribution of trip duration for each rider gender.
# Only trips up to the upper fence (third quartile + 1.5 * interquartile range)
# are plotted. The fence is derived from the q3 / qr values computed earlier in
# this file rather than from hard-coded numbers, so it follows the data if the
# input file or the cleaning steps change.
sb.violinplot(
    data = df[df['duration_sec'] <= q3 + 1.5 * qr],
    x = 'member_gender',
    y = 'duration_sec',
    color = sb.color_palette()[0],
);
plt.title('The Rider Gender and Duration in Seconds');
plt.xlabel('Gender of Rider');
plt.ylabel('Duration in Seconds');
As you can see in the first graph, I compared birth year with gender, and it looks like female riders peak highest between 1980 and 1990, more so than the other genders.
In the second graph, I compared duration with user type, and we find that Customers take more time than Subscribers.
# 2-D histograms of birth year against trip duration, one facet per user type.

# Upper limit of the duration axis: third quartile + 1.5 * interquartile range,
# taken from the q3 / qr values computed earlier in this file instead of the
# hard-coded 838 and 474.
duration_cap = q3 + 1.5 * qr

# 5-year bins on the x axis, 500-second bins on the y axis.
x_bin = np.arange(df['member_birth_year'].min(), df['member_birth_year'].max() + 5, 5)
y_bin = np.arange(0, duration_cap + 500, 500)

# Plot a reproducible random sample. The size is capped at the number of rows
# because DataFrame.sample raises ValueError when asked for more rows than
# exist (sampling without replacement).
_temp = df.sample(min(300000, len(df)), random_state=42)

m = sb.FacetGrid(data = _temp, col = 'user_type', height=5.4)
# cmin=0.5 leaves cells without any trips blank instead of colouring them.
m.map(plt.hist2d, 'member_birth_year', 'duration_sec', cmin=0.5, cmap = 'viridis_r', bins=[x_bin, y_bin]);
m.set(ylim = (0, duration_cap));
# Label through the grid so every facet is covered; plt.xlabel / plt.ylabel
# would only change the last facet drawn.
m.set_axis_labels('Year of Birth', 'Duration in Seconds');
plt.colorbar();
# 2-D histograms of birth year against trip duration, one facet per rider gender.

# Upper limit of the duration axis: third quartile + 1.5 * interquartile range,
# taken from the q3 / qr values computed earlier in this file instead of the
# hard-coded 838 and 474.
duration_cap = q3 + 1.5 * qr

# 5-year bins on the x axis, 500-second bins on the y axis.
x_bin = np.arange(df['member_birth_year'].min(), df['member_birth_year'].max() + 5, 5)
y_bin = np.arange(0, duration_cap + 500, 500)

# Plot a reproducible random sample. The size is capped at the number of rows
# because DataFrame.sample raises ValueError when asked for more rows than
# exist (sampling without replacement).
_temp = df.sample(min(300000, len(df)), random_state=42)

m = sb.FacetGrid(data = _temp, col = 'member_gender', height=5.4)
# cmin=0.5 leaves cells without any trips blank instead of colouring them.
m.map(plt.hist2d, 'member_birth_year', 'duration_sec', cmin=0.5, cmap = 'viridis_r', bins=[x_bin, y_bin]);
m.set(ylim = (0, duration_cap));
# Readable axis labels, matching the per-user-type version of this plot.
m.set_axis_labels('Year of Birth', 'Duration in Seconds');
plt.colorbar();
# Heat map of the mean trip duration for every user type / gender combination.

# Select 'duration_sec' BEFORE aggregating. Calling .mean() on the whole
# grouped frame averages every column, which is wasted work and raises
# TypeError on non-numeric columns in pandas >= 2.0.
cat_means = df.groupby(['member_gender', 'user_type'])['duration_sec'].mean()
cat_means = cat_means.reset_index(name = 'Duration_avg')

# Reshape to a user_type x member_gender table, the layout heatmap expects.
cat_means = cat_means.pivot(index = 'user_type', columns = 'member_gender',
                            values = 'Duration_avg')
sb.heatmap(cat_means, annot = True, fmt = '.3f',
           cbar_kws = {'label' : 'mean(Duration_avg)'});