Data Analysis Project: Bike Sharing Dataset¶

  • Name: Yandi Hermawan
  • E-mail: iyaand333@gmail.com
  • IdDicoding: -

Defining the Business Questions¶

  1. When do most users use bike sharing?
  2. What factors increase bike sharing usage?

Preparing all the required libraries¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')  # set an explicit style; set_style() with no argument is a no-op
from scipy import stats
In [2]:
def adjust(h=5):
    # open a wide, high-resolution figure before each plot
    plt.figure(figsize=(16, h), dpi=300)

Data Wrangling¶

Gathering Data¶

hour¶

In [3]:
hour = pd.read_csv('hour.csv')
hour.head()
Out[3]:
instant dteday season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
0 1 2011-01-01 1 0 1 0 0 6 0 1 0.24 0.2879 0.81 0.0 3 13 16
1 2 2011-01-01 1 0 1 1 0 6 0 1 0.22 0.2727 0.80 0.0 8 32 40
2 3 2011-01-01 1 0 1 2 0 6 0 1 0.22 0.2727 0.80 0.0 5 27 32
3 4 2011-01-01 1 0 1 3 0 6 0 1 0.24 0.2879 0.75 0.0 3 10 13
4 5 2011-01-01 1 0 1 4 0 6 0 1 0.24 0.2879 0.75 0.0 0 1 1

day¶

In [4]:
day = pd.read_csv('day.csv')
day.head()
Out[4]:
instant dteday season yr mnth holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
0 1 2011-01-01 1 0 1 0 6 0 2 0.344167 0.363625 0.805833 0.160446 331 654 985
1 2 2011-01-02 1 0 1 0 0 0 2 0.363478 0.353739 0.696087 0.248539 131 670 801
2 3 2011-01-03 1 0 1 0 1 1 1 0.196364 0.189405 0.437273 0.248309 120 1229 1349
3 4 2011-01-04 1 0 1 0 2 1 1 0.200000 0.212122 0.590435 0.160296 108 1454 1562
4 5 2011-01-05 1 0 1 0 3 1 1 0.226957 0.229270 0.436957 0.186900 82 1518 1600

Assessing Data¶

hour¶

In [5]:
hour.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB
In [6]:
hour.isna().sum()
Out[6]:
instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

day¶

In [7]:
day.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB
In [8]:
day.isna().sum()
Out[8]:
instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64
In [9]:
print(f"total duplicated file for day dataset {day.duplicated().sum()}")
print(f"total duplicated file for hour dataset {hour.duplicated().sum()}")
total duplicated file for day dataset 0
total duplicated file for hour dataset 0

I believe the columns yr, mnth, hr, holiday, weekday, workingday, and weathersit in both the hour and day datasets should be categorical. For the hour dataset, I do not need the dteday column because dates repeat across rows, so it would not make a good index.
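The repeated-dates claim is easy to verify directly. A minimal sketch, using a tiny made-up frame with the dataset's column names (on the real file, `hour['dteday'].duplicated().any()` answers the same question):

```python
import pandas as pd

# hypothetical miniature of hour.csv: one date spans several hourly rows
mini = pd.DataFrame({
    'dteday': ['2011-01-01', '2011-01-01', '2011-01-01', '2011-01-02'],
    'hr': [0, 1, 2, 0],
})

# dates repeat across rows, so dteday cannot serve as a unique index
has_repeats = mini['dteday'].duplicated().any()
rows_per_date = mini.groupby('dteday').size()
print(has_repeats)          # True
print(rows_per_date.max())  # 3
```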

Cleaning Data¶

hour¶

In [10]:
# I rename cnt to y because cnt is the target and I am committed to calling the target y
columns = {'dteday': 'date',
           'yr': 'year',
           'mnth': 'month',
           'hr': 'hour',
           'cnt': 'y'}
In [11]:
hour = hour.rename(columns=columns)
hour = hour.drop(['instant', 'date'], axis=1)
hour.tail()
Out[11]:
season year month hour holiday weekday workingday weathersit temp atemp hum windspeed casual registered y
17374 1 1 12 19 0 1 1 2 0.26 0.2576 0.60 0.1642 11 108 119
17375 1 1 12 20 0 1 1 2 0.26 0.2576 0.60 0.1642 8 81 89
17376 1 1 12 21 0 1 1 1 0.26 0.2576 0.60 0.1642 7 83 90
17377 1 1 12 22 0 1 1 1 0.26 0.2727 0.56 0.1343 13 48 61
17378 1 1 12 23 0 1 1 1 0.26 0.2727 0.65 0.1343 12 37 49
In [12]:
cols = ['season', 'year', 'month', 'holiday', 'weekday', 'workingday', 'weathersit']
for col in cols:
    hour[col] = hour[col].astype('category')
hour['hour'] = hour['hour'].astype('category')
hour.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   season      17379 non-null  category
 1   year        17379 non-null  category
 2   month       17379 non-null  category
 3   hour        17379 non-null  category
 4   holiday     17379 non-null  category
 5   weekday     17379 non-null  category
 6   workingday  17379 non-null  category
 7   weathersit  17379 non-null  category
 8   temp        17379 non-null  float64 
 9   atemp       17379 non-null  float64 
 10  hum         17379 non-null  float64 
 11  windspeed   17379 non-null  float64 
 12  casual      17379 non-null  int64   
 13  registered  17379 non-null  int64   
 14  y           17379 non-null  int64   
dtypes: category(8), float64(4), int64(3)
memory usage: 1.1 MB
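The payoff of the category conversion shows up in the `memory usage` line above (2.3+ MB down to 1.1 MB). A minimal sketch of why, on a synthetic column with the same length and cardinality as weathersit (values are made up):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
s = pd.Series(rng.integers(1, 5, size=17379))  # 4 levels, like weathersit

before = s.memory_usage(index=False, deep=True)                     # stored as int64
after = s.astype('category').memory_usage(index=False, deep=True)   # small integer codes + 4 category values

print(before, after)  # the category version is several times smaller
```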

day¶

In [13]:
day = day.rename(columns=columns)
day['date'] = pd.to_datetime(day['date'], yearfirst=True)
day.index = day['date']
day = day.drop(['instant', 'date'], axis=1)
day.tail()
Out[13]:
season year month holiday weekday workingday weathersit temp atemp hum windspeed casual registered y
date
2012-12-27 1 1 12 0 4 1 2 0.254167 0.226642 0.652917 0.350133 247 1867 2114
2012-12-28 1 1 12 0 5 1 2 0.253333 0.255046 0.590000 0.155471 644 2451 3095
2012-12-29 1 1 12 0 6 0 2 0.253333 0.242400 0.752917 0.124383 159 1182 1341
2012-12-30 1 1 12 0 0 0 1 0.255833 0.231700 0.483333 0.350754 364 1432 1796
2012-12-31 1 1 12 0 1 1 2 0.215833 0.223487 0.577500 0.154846 439 2290 2729
In [14]:
for col in cols:
    day[col] = day[col].astype('category')
day.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 731 entries, 2011-01-01 to 2012-12-31
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   season      731 non-null    category
 1   year        731 non-null    category
 2   month       731 non-null    category
 3   holiday     731 non-null    category
 4   weekday     731 non-null    category
 5   workingday  731 non-null    category
 6   weathersit  731 non-null    category
 7   temp        731 non-null    float64 
 8   atemp       731 non-null    float64 
 9   hum         731 non-null    float64 
 10  windspeed   731 non-null    float64 
 11  casual      731 non-null    int64   
 12  registered  731 non-null    int64   
 13  y           731 non-null    int64   
dtypes: category(7), float64(4), int64(3)
memory usage: 52.1 KB

In my opinion, of the two datasets above, the day dataset fits the time-series case better.
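Setting date as a DatetimeIndex is what makes the day dataset time-series friendly: resampling and date slicing work out of the box. A minimal sketch on made-up daily counts (the index construction mirrors the cleaning step above):

```python
import pandas as pd

# hypothetical daily ride counts over two months, indexed like the cleaned day frame
idx = pd.date_range('2011-01-01', periods=60, freq='D')
daily = pd.Series(range(60), index=idx, name='y')

monthly = daily.resample('MS').sum()  # monthly totals, one row per month start
print(len(monthly))  # 3 (January, February, and one day of March)
```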

Exploratory Data Analysis (EDA)¶

Explore ...¶

Because I have not learned time-series analysis yet, I start by exploring the time-series dataset first, even with my still-limited knowledge of time-series problems.

day¶

In [15]:
# I start from column 8 because columns 1-7 are categorical
cols = day.keys()[7:]

for c in cols:
    day[c].plot(figsize=(16,5))
    plt.title(c)
    plt.show()

findings:

  • the trends for temp and atemp look similar. the next step is to check their correlation; if it is above 90%, one variable can stand in for the other.
  • in both 2011 and 2012, July and September show the highest temp and atemp values. the trend for casual users looks similar to temp and atemp, so that correlation needs checking as well.
  • the count of users appears to decline from October 2012 to the end of 2012; this needs an explanation.
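The 90% rule of thumb from the first finding can be checked with a single `.corr()` call. A sketch on synthetic data (the column names follow the dataset; the values are fabricated to mimic two near-identical measurements):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
temp = rng.uniform(0, 1, size=500)
atemp = 0.9 * temp + rng.normal(0, 0.02, size=500)  # "feels-like" tracking measured temperature

# Pearson correlation between the two columns
r = pd.Series(temp).corr(pd.Series(atemp))
print(round(r, 3))  # close to 1: one column can stand in for the other
```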

hour¶

numeric¶

In [16]:
numeric = hour.describe().round(3)
numeric
Out[16]:
temp atemp hum windspeed casual registered y
count 17379.000 17379.000 17379.000 17379.000 17379.000 17379.000 17379.000
mean 0.497 0.476 0.627 0.190 35.676 153.787 189.463
std 0.193 0.172 0.193 0.122 49.305 151.357 181.388
min 0.020 0.000 0.000 0.000 0.000 0.000 1.000
25% 0.340 0.333 0.480 0.104 4.000 34.000 40.000
50% 0.500 0.485 0.630 0.194 17.000 115.000 142.000
75% 0.660 0.621 0.780 0.254 48.000 220.000 281.000
max 1.000 1.000 1.000 0.851 367.000 886.000 977.000
In [17]:
col_cat = hour.keys()[:8]
col_num = hour.keys()[8:]
print(col_cat)
print(col_num)
Index(['season', 'year', 'month', 'hour', 'holiday', 'weekday', 'workingday',
       'weathersit'],
      dtype='object')
Index(['temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'y'], dtype='object')
In [18]:
for c in col_num:

    # viz for boxplot
    adjust(h=3)
    sns.boxplot(x=hour[c])
    plt.axvline(x=hour[c].mean(), c='r', label='mean')
    plt.legend()
    plt.show()

    # viz for histplot
    adjust(h=3)    
    sns.histplot(hour[c], kde=True)
    plt.axvline(x=hour[c].mean(), c='r', label='mean')
    plt.legend()
    plt.show()
In [19]:
for c in col_num:
    stat, p = stats.shapiro(hour[c])
    if p > 0.05: # accept h0
        print(f"{c} probably gaussian")
    else:
        print(f"{c} probably not gaussian")
temp probably not gaussian
atemp probably not gaussian
hum probably not gaussian
windspeed probably not gaussian
casual probably not gaussian
registered probably not gaussian
y probably not gaussian
c:\Users\yandiher\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\stats\_morestats.py:1816: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

findings:

  • for temp, atemp, hum, and windspeed, the mean and median are close to each other.
  • none of the numeric variables are normally distributed.
  • casual, registered, and y are right-skewed: outliers pull the mean away from the median, and these outliers will affect every statistical method that relies on the standard deviation.
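The scipy warning above notes that Shapiro-Wilk p-values are unreliable for N > 5000. D'Agostino's K² test (`scipy.stats.normaltest`) is a common large-sample alternative; a sketch on a synthetic right-skewed sample of the same size as the hour data:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
skewed = rng.exponential(size=17379)  # right-skewed, like casual/registered/y

# D'Agostino's K^2 combines skewness and kurtosis and scales to large N
stat, p = stats.normaltest(skewed)
print(p < 0.05)  # True: reject normality
```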

categorical¶

In [20]:
categorical = hour[col_cat].describe()
categorical = categorical.T
categorical['percent_top'] = (categorical['freq']/categorical['count']).round(3)
categorical = categorical.T
categorical
Out[20]:
season year month hour holiday weekday workingday weathersit
count 17379.000 17379.000 17379.000 17379.000 17379.000 17379.000 17379.000 17379.000
unique 4.000 2.000 12.000 24.000 2.000 7.000 2.000 4.000
top 3.000 1.000 5.000 17.000 0.000 6.000 1.000 1.000
freq 4496.000 8734.000 1488.000 730.000 16879.000 2512.000 11865.000 11413.000
percent_top 0.259 0.503 0.086 0.042 0.971 0.145 0.683 0.657

from here, we can see what drives y.

the best view of this dataset is x = hour (0-23), y = y (the count of users), with the other categorical variables as hue.

we can also split y into casual and registered.

In [21]:
# statements
x = hour['hour']
y = hour['y']
casual = hour['casual']
registered = hour['registered']
In [22]:
def viz(target, hue, title):
    adjust(h=6)
    sns.pointplot(x=x, y=target, hue=hue)
    plt.title(title)
    plt.show()

I use seaborn's hue parameter to group by a categorical variable and let the plot aggregate y, instead of building a pd.pivot_table() first.
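The hue-grouped pointplot is equivalent to aggregating first and plotting the result; a sketch of the pivot_table it replaces, on made-up rows (note pointplot's default estimator is the mean):

```python
import pandas as pd

# hypothetical hourly rows mimicking the hour dataset
df = pd.DataFrame({
    'hour':   [0, 0, 1, 1, 0, 1],
    'season': [1, 1, 1, 1, 2, 2],
    'y':      [10, 20, 30, 50, 5, 7],
})

# mean of y per (hour, season) cell: the table pointplot(x='hour', y='y', hue='season') draws
pivot = pd.pivot_table(df, values='y', index='hour', columns='season', aggfunc='mean')
print(pivot)
```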

In [23]:
viz(registered, hour['season'], 'Total of Bike Sharing during Season for registered users')

findings:

  • in every season, demand rises from 5 until 8 and falls after that until 10; that window is typically when people commute to work or school.
  • likewise, in every season demand rises again from 15 until 17 and then decreases gradually until 23.
  • this suggests demand for bike sharing peaks at commute times: 8 is normally when people head to work or study, and 17 is normally when people head home.
In [24]:
viz(casual, hour['season'], 'Total of Bike Sharing during Season for casual users')

findings:

  • compared to registered users, casual users form a pattern like a single sine-wave cycle.
  • every season shows the same pattern; what differs across seasons is the total number of bike users.
  • unlike registered users, casual usage is high precisely when registered usage is not. my personal assumption is that casual users ride on holidays or similar occasions.
In [25]:
viz(registered, hour['weekday'], 'Total of Bike Sharing during weekday for registered users')

findings:

  • the daily pattern for registered users looks much like the pattern seen per season.
  • there are two distinct lines: 0 and 6. the data notes say these represent Saturday and Sunday. no wonder bike sharing use is higher during working hours on weekends: registered users ride more at those hours on weekends.
In [26]:
viz(casual, hour['weekday'], 'Total of Bike Sharing during weekday for casual users')

findings:

  • at working hours, more casual users use bike sharing on weekends.
In [27]:
viz(registered, hour['workingday'], 'Total of Bike Sharing during workingday for registered users')

findings:

  • for registered users, demand at commute times (going to work or school and returning home) is high only on working days.
  • on non-working days, demand is instead high during working hours.
In [28]:
viz(casual, hour['workingday'], 'Total of Bike Sharing during workingday for casual users')

findings:

  • casual users use more bike sharing at working hours when it is not working day.
In [29]:
viz(registered, hour['year'], 'Total of Bike Sharing during 2011 and 2012 for registered users')

findings:

  • although year 0 and year 1 show a similar pattern for registered users, demand in year 0 is lower than in year 1.
In [30]:
viz(casual, hour['year'], 'Total of Bike Sharing during 2011 and 2012 for casual users')

findings:

  • demand in year 0 is lower than in year 1.
In [31]:
viz(registered, hour['month'], 'Total of Bike Sharing every month for registered users')

findings:

  • nothing deviates much from the patterns found before. I still need to figure out why the blue line is the highest.
In [32]:
viz(casual, hour['month'], 'Total of Bike Sharing every month for casual users')

findings:

  • nothing deviates much from the patterns found before for casual users. I still need to figure out why the blue line is the highest.
In [33]:
viz(registered, hour['holiday'], 'Total of Bike Sharing on holiday for registered users')

findings:

  • demand from registered users looks high when it is not a holiday.
In [34]:
viz(registered, hour['weathersit'], 'Total of Bike Sharing based on weather for registered users')

findings:

  • there are anomalies: 4 indicates heavy rain, and only a few people at specific times use bike sharing when it is raining.
In [35]:
viz(casual, hour['weathersit'], 'Total of Bike Sharing based on weather for casual users')

findings:

  • there are anomalies: 4 indicates heavy rain, and only a few people at specific times use bike sharing when it is raining.
In [36]:
corr = hour.corr().round(3)

adjust(h=7)
sns.heatmap(corr, annot=True)
plt.title('Correlation matrix for all variables')
plt.show()
C:\Users\yandiher\AppData\Local\Temp\ipykernel_1188\2841791570.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  corr = hour.corr().round(3)

findings:

  • month and season are correlated with each other at 83%, meaning month largely explains season and vice versa.
  • temp and atemp are almost perfectly correlated, so one of them is unnecessary as a predictor.
  • registered and casual are correlated with y because y is the sum of registered and casual users.
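Flagging near-duplicate columns like temp/atemp can be automated with a threshold over the upper triangle of the correlation matrix. A sketch on synthetic columns (names follow the dataset; 0.9 is an arbitrary cutoff):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
temp = rng.uniform(size=200)
df = pd.DataFrame({
    'temp': temp,
    'atemp': temp + rng.normal(0, 0.01, size=200),  # near-duplicate of temp
    'windspeed': rng.uniform(size=200),             # unrelated
})

corr = df.corr().abs()
# keep each pair once by masking everything but the strict upper triangle
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
redundant = [col for col in upper.columns if (upper[col] > 0.9).any()]
print(redundant)  # ['atemp'] -> candidate to drop
```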
In [54]:
y_label = ['registered', 'casual', 'y']

# use a distinct loop name so the y Series defined earlier is not shadowed
for label in y_label:
    print(label)
    print(hour[['temp', 'hum', 'windspeed']].corrwith(hour[label]))
registered
temp         0.335361
hum         -0.273933
windspeed    0.082321
dtype: float64
casual
temp         0.459616
hum         -0.347028
windspeed    0.090287
dtype: float64
y
temp         0.404772
hum         -0.322911
windspeed    0.093234
dtype: float64

findings:

  • from the correlation matrix, it is interesting that workingday has a positive correlation with registered users and a negative correlation with casual users.
In [38]:
# drop month and atemp
hour = hour.drop(['month', 'atemp'], axis=1)
hour.tail()
Out[38]:
season year hour holiday weekday workingday weathersit temp hum windspeed casual registered y
17374 1 1 19 0 1 1 2 0.26 0.60 0.1642 11 108 119
17375 1 1 20 0 1 1 2 0.26 0.60 0.1642 8 81 89
17376 1 1 21 0 1 1 1 0.26 0.60 0.1642 7 83 90
17377 1 1 22 0 1 1 1 0.26 0.56 0.1343 13 48 61
17378 1 1 23 0 1 1 1 0.26 0.65 0.1343 12 37 49
In [39]:
col_cat = hour.keys()[:7]

for c in col_cat:
    adjust()
    ax = sns.barplot(data=hour, x='y', y=c, errorbar=None)
    ax.bar_label(ax.containers[0])
    plt.title(c)
    plt.show()

findings:

  • from weathersit, demand is highest when the weather is clear, few clouds, or partly cloudy.
  • from workingday, there is no big difference between working and non-working days.
  • from weekday, demand is highest on Thursday and Friday, though not far above the other days.
  • demand is higher on non-holidays than on holidays.
  • there is also a big difference between 2011 and 2012: demand in 2012 is higher.
  • demand is lowest in spring.
  • the hour variable shows demand peaking at 8, 17, and 18.
In [40]:
adjust()
sns.pointplot(x=hour['hour'], y=hour['casual'], label='casual')
sns.pointplot(x=hour['hour'], y=hour['registered'], label='registered')
sns.pointplot(x=hour['hour'], y=hour['y'], label='y')
plt.legend()
plt.show()

findings:

  • demand from casual users is higher during working hours.
  • overall, demand for bike sharing rises sharply when people go to work or school and when they return home.
In [41]:
# day.to_csv('day_clean.csv')
# hour.to_csv('hour_clean.csv')

Visualization & Explanatory Analysis¶

Question 1:¶

When do most users use bike sharing?

In [42]:
col_cat = hour.keys()[:7]

for c in col_cat:
    adjust()
    ax = sns.barplot(data=hour, x='y', y=c, errorbar=None)
    ax.bar_label(ax.containers[0])
    plt.title(c)
    plt.show()

findings:

  • from weathersit, demand is highest when the weather is clear, few clouds, or partly cloudy.
  • from workingday, there is no big difference between working and non-working days.
  • from weekday, demand is highest on Thursday and Friday, though not far above the other days.
  • demand is higher on non-holidays than on holidays.
  • there is also a big difference between 2011 and 2012: demand in 2012 is higher.
  • demand is lowest in spring.
  • the hour variable shows demand peaking at 8, 17, and 18.

Question 2:¶

What factors increase bike sharing usage?

In [56]:
for label in y_label:
    print(label)
    print(hour[['temp', 'hum', 'windspeed']].corrwith(hour[label]))
registered
temp         0.335361
hum         -0.273933
windspeed    0.082321
dtype: float64
casual
temp         0.459616
hum         -0.347028
windspeed    0.090287
dtype: float64
y
temp         0.404772
hum         -0.322911
windspeed    0.093234
dtype: float64

findings:

  • overall, temp shows the strongest positive correlation and hum the strongest negative correlation with bike sharing demand among the numeric variables.

Conclusion¶

  • Conclusion for question 1: overall, demand for bike sharing rises sharply when people go to work or school and when they return home.
  • Conclusion for question 2: among the numeric variables, temp shows the strongest positive correlation and hum the strongest negative correlation with bike sharing demand.
In [ ]: