background

BOCI (bakso aci) is a typical Indonesian meatball made from tapioca starch. What makes it interesting is not the boci itself: one bowl contains many types of meatballs, yet the menu is simply called boci. In Indonesia, many street vendors sell boci, especially in West Java and Jakarta. Because of its popularity, one of my friends tried to make a profit by selling boci.

When I asked him, "How much do you earn per day?" he said, "I'm not sure. Sometimes 60, sometimes 70, sometimes 85 when the street is crowded."

"And how do you decide how many portions to prepare per day?" I asked.

"I can't really answer that. Every day I take IDR100K for my daily meals, and the rest becomes the capital for the next day."

He sells boci from Monday to Friday and works as a freelancer on weekends.

After our discussion, I asked him to record his sales every day for 90 days, noting which day of the week it was. I also asked him to note whether it rained on each selling day, because I had an intuition that something happens when it rains. After 90 days, he gave me the notes he took while selling his boci.

disclaimer: this is a fully fake dataset, built with numpy's random distribution generators for educational purposes only.
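since the dataset is synthetic, here is a minimal sketch of how a similar fake dataset could be generated with numpy. this is only my guess at the approach (the distribution parameters and the rainy-day boost are assumptions), not the actual generation script:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

# 90 business days starting 2024-01-01 (he only sells monday to friday)
dates = pd.bdate_range('2024-01-01', periods=90)

# roughly half of the days are rainy
weather = rng.choice(['rainy', 'sunny'], size=90)

# assumed: rainy days get a small boost on top of a normal baseline
base = rng.normal(loc=77, scale=4, size=90)
sales = np.round(base + np.where(weather == 'rainy', 5, 0))

fake = pd.DataFrame({'date': dates,
                     'weather': weather,
                     'day': dates.day_name(),
                     'sales': sales})
print(fake.head())
```

the real `penjualan_boci.csv` was produced by the author's own script, so the exact parameters will differ.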

library and dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as s
In [2]:
dataset = pd.read_csv('penjualan_boci.csv')
dataset.drop(dataset.columns[0], axis=1, inplace=True)
dataset.head()
Out[2]:
date weather day sales
0 2024-01-01 sunny Monday 79.0
1 2024-01-02 rainy Tuesday 80.0
2 2024-01-03 rainy Wednesday 84.0
3 2024-01-04 rainy Thursday 78.0
4 2024-01-05 sunny Friday 83.0

data cleaning

In [3]:
dataset.isna().sum()
Out[3]:
date       0
weather    0
day        0
sales      0
dtype: int64

there is no missing data, so we can continue.

In [4]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   date     90 non-null     object 
 1   weather  90 non-null     object 
 2   day      90 non-null     object 
 3   sales    90 non-null     float64
dtypes: float64(1), object(3)
memory usage: 2.9+ KB

the date column is still an object. it's better to convert it to datetime and make it our index.

In [5]:
dataset.index = pd.to_datetime(dataset.date)
dataset = dataset.drop(['date'], axis=1)
dataset.head()
Out[5]:
weather day sales
date
2024-01-01 sunny Monday 79.0
2024-01-02 rainy Tuesday 80.0
2024-01-03 rainy Wednesday 84.0
2024-01-04 rainy Thursday 78.0
2024-01-05 sunny Friday 83.0

it's nice now. let's find something interesting!

exploratory data analysis

coding

In [6]:
print(dataset['weather'].describe())
count        90
unique        2
top       rainy
freq         46
Name: weather, dtype: object

there are 90 records with two unique values. rainy appears most frequently, 46 times.

In [7]:
print(dataset['day'].describe())
count         90
unique         5
top       Monday
freq          18
Name: day, dtype: object

i guess the day column is not very informative, since he sells every weekday without taking a day off.

In [8]:
print(dataset['sales'].describe().round(3))
count    90.000
mean     79.311
std       5.335
min      68.000
25%      76.000
50%      79.000
75%      83.000
max      92.000
Name: sales, dtype: float64

so far, he has sold at least 68 portions every day, which is nice. his biggest achievement is selling 92 portions in a day. the middle 50% of sales lies between 76 and 83, the average is 79, and the median is also 79. i have a feeling that these sales follow a gaussian distribution.

let's visualize using boxplot to understand the data easily.

In [151]:
plt.figure(figsize=(16,5))
sns.boxplot(x=dataset.sales,
            color='cyan',
            showmeans=True, 
            meanprops={'marker':'o',
                       'markerfacecolor':'r'})
plt.title('boci sales in boxplot')
plt.show()

from the viz, we can see that the middle 50% of the data lies inside the box.

outliers are data values which are extremely far from the rest of the distribution. there are two ways to check for outliers:

  1. the boxplot iqr rule
  2. more than $3 \times std$ from the mean
In [29]:
min_value = dataset['sales'].min()
max_value = dataset['sales'].max()
std_value = dataset['sales'].std()
mean_value = dataset['sales'].mean()
q3 = np.percentile(dataset['sales'], 75)
q1 = np.percentile(dataset['sales'], 25)
iqr = q3-q1
In [30]:
iqr_min_threshold = q1-(1.5*iqr)
iqr_max_threshold = q3+(1.5*iqr)
std_min_threshold = mean_value-(std_value*3)
std_max_threshold = mean_value+(std_value*3)

sturges' rule gives the number of histogram bins: $k = 1 + 3.322 \log_{10}(90) \approx 7.49 \cong 7$

In [62]:
# find bins with sturges' rule: k = 1 + 3.322*log10(n)
k = 1 + 3.322*np.log10(dataset['sales'].shape[0])
round(k, 3)
Out[62]:
7.492
In [157]:
plt.figure(figsize=(16,5))
sns.histplot(dataset.sales, kde=True, bins=int(k), color='cyan')
plt.title('histogram of boci sales')
plt.show()
In [64]:
def outliers_detection(value, threshold, check):
    if check == 'min':
        if value < threshold:
            print(f"there are outliers detected in {check} value")
        else:
            print(f"there are no outliers detected in {check} value")
    elif check == 'max':
        if value > threshold:
            print(f"there are outliers detected in {check} value")
        else:
            print(f"there are no outliers detected in {check} value")
In [65]:
print('outliers from iqr')
outliers_detection(min_value, iqr_min_threshold, check='min')
outliers_detection(max_value, iqr_max_threshold, check='max')
print("")
print('outliers from std')
outliers_detection(min_value, std_min_threshold, check='min')
outliers_detection(max_value, std_max_threshold, check='max')
outliers from iqr
there are no outliers detected in min value
there are no outliers detected in max value

outliers from std
there are no outliers detected in min value
there are no outliers detected in max value

it means that there is no data value extremely different from the rest of the distribution.

In [66]:
mean, var, std = s.bayes_mvs(data=dataset['sales'], alpha=0.95)
s.bayes_mvs(data=dataset['sales'], alpha=0.95)
Out[66]:
(Mean(statistic=79.31111111111112, minmax=(78.19368372079686, 80.42853850142538)),
 Variance(statistic=29.118263090676876, minmax=(21.65406291572004, 39.09796977378829)),
 Std_dev(statistic=5.380650553108442, minmax=(4.653392624281777, 6.252836938045665)))

findings

In [67]:
print(f"based on boxplot, 50% of sales lies between {q1} and {q3}.")
based on boxplot, 50% of sales lies between 76.0 and 83.0.
In [68]:
print(f"based on empirical rule, 68% of sales lies between {mean_value-std_value} and {mean_value+std_value}.")
based on empirical rule, 68% of sales lies between 73.97595224750873 and 84.6462699747135.

so now he understands how many portions he can prepare every day.
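the 68% figure comes from the empirical rule for normal distributions. a quick simulation with synthetic normal data (roughly matching the sales mean and std, but not the real sales) shows where that number comes from:

```python
import numpy as np

rng = np.random.default_rng(0)

# synthetic normal sample with roughly the sales mean and std
x = rng.normal(loc=79.3, scale=5.3, size=100_000)

# fraction of values within one standard deviation of the mean
within_1_std = np.mean(np.abs(x - x.mean()) <= x.std())
print(round(within_1_std, 2))  # ≈ 0.68
```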

In [69]:
print(f"after 90 days of selling, he can describe that the average of selling boci is {mean_value}.")
print(f"and he is 95% confident that when he collects more data, the average will lie between {mean[1][0]} and {mean[1][1]}.")
after 90 days of selling, he can describe that the average of selling boci is 79.31111111111112.
and he is 95% confident that when he collects more data, the average will lie between 78.19368372079686 and 80.42853850142538.

hypothesis testing

Bogor, Indonesia is famous for its rain; it's the reason Bogor is called the rain city. when the rain comes, I always look for gravy food, and boci is one of them. so I think there is a difference between rainy and sunny days in boci sales. let's conduct our a/b testing.

let's declare the hypothesis.

mathematically:

$H_0: \mu_0 = \mu_1$

$H_1: \mu_0 \neq \mu_1$

conceptually:

$H_0:$ there is no significant difference between a and b.

$H_1:$ there is significant difference between a and b.

assumption check

to conduct a/b testing, I need to make sure this dataset satisfies these assumptions.

  1. dataset should be numeric.
  2. there are two groups to compare.
  3. the samples are independent.
  4. the data are normally distributed. if not, try mann whitney-u.
  5. the data has equal variance.
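assumption 4 mentions the mann-whitney u test as the fallback when normality fails. a minimal sketch of that decision logic with scipy (the two groups here are synthetic placeholders, not the real rainy/sunny data):

```python
import numpy as np
from scipy import stats as s

rng = np.random.default_rng(1)
a = rng.normal(80, 5, 45)   # placeholder group, e.g. rainy
b = rng.normal(77, 5, 45)   # placeholder group, e.g. sunny

# shapiro-wilk normality check on both groups
normal = all(s.shapiro(g).pvalue > 0.05 for g in (a, b))

if normal:
    # parametric: independent two-sample t-test
    stat, p = s.ttest_ind(a, b)
    test_used = 't-test'
else:
    # non-parametric fallback: mann-whitney u
    stat, p = s.mannwhitneyu(a, b)
    test_used = 'mann-whitney u'

print(test_used, round(p, 4))
```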
In [70]:
dataset['sales'].info()
<class 'pandas.core.series.Series'>
DatetimeIndex: 90 entries, 2024-01-01 to 2024-05-03
Series name: sales
Non-Null Count  Dtype  
--------------  -----  
90 non-null     float64
dtypes: float64(1)
memory usage: 1.4 KB

we can see that the type of our dependent variable is float, which means numeric.

  1. dataset should be numeric.
In [71]:
rainy = dataset[dataset['weather'] == 'rainy']['sales']
sunny = dataset[dataset['weather'] == 'sunny']['sales']

I have already separated them into two groups: rainy and sunny.

  2. there are two groups to compare.
In [72]:
dataset.index.has_duplicates
Out[72]:
False

the index has no duplicates, which means he never recorded the same day twice. we can say that the sales values are independent.

  3. the samples are independent.
In [73]:
def gaussian_check(array):
    stat, p = s.shapiro(array)
    if p > 0.05:
        print(f"the data is normally distributed with p-value {p}.")
    else:
        print(f"the data is not normally distributed with p-value {p}.")
In [74]:
gaussian_check(sunny)
gaussian_check(rainy)
the data is normally distributed with p-value 0.11649647355079651.
the data is normally distributed with p-value 0.38656672835350037.

if the p-value is smaller than $0.05$, we can reject gaussian. our p-values are bigger than $0.05$, which means we fail to reject gaussian. so, our data are normally distributed.

  4. the data are normally distributed. if not, try mann whitney-u.
In [75]:
def variance_check(array_1, array_2):
    stat, p = s.levene(array_1, array_2)
    if p > 0.05:
        print(f"the data has equal variance with p-value {p}.")
    else:
        print(f"the data violates equal variance with p-value {p}.")
In [76]:
variance_check(sunny, rainy)
the data has equal variance with p-value 0.2188004442901671.

if the p-value is smaller than $0.05$, we can reject equal variance. our p-value is bigger than $0.05$ which means we failed to reject equal variance. so, our data has equal variance.

  5. the data has equal variance.
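one more note: if levene's test had rejected equal variance, the usual fix is welch's t-test, which scipy exposes through the `equal_var=False` parameter of `ttest_ind`. a sketch with synthetic groups of deliberately unequal variance:

```python
import numpy as np
from scipy import stats as s

rng = np.random.default_rng(2)
a = rng.normal(80, 2, 45)    # low-variance synthetic group
b = rng.normal(77, 10, 45)   # high-variance synthetic group

# levene's test for equal variance
lev_p = s.levene(a, b).pvalue

if lev_p > 0.05:
    stat, p = s.ttest_ind(a, b)                   # standard t-test
else:
    stat, p = s.ttest_ind(a, b, equal_var=False)  # welch's t-test

print(round(lev_p, 6), round(p, 6))
```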

primary analysis

In [77]:
stat, p = s.ttest_ind(rainy, sunny)
In [78]:
if p > 0.05:
    print(f"there is no significant difference between rainy and sunny with p-value {p}.")
else:
    print(f"there is significant difference between rainy and sunny with p-value {p}.")
there is significant difference between rainy and sunny with p-value 8.08519982507261e-06.

we can see that there is a significant difference between rainy and sunny. I asked my friend to record whether it was raining because I had an intuition that sales are bigger when it rains. so we can conduct a one-sided test.

to conduct one-tail a/b testing, we use these hypotheses:

$H_0: \mu_0 \le \mu_1$

$H_1: \mu_0 \gt \mu_1$

conceptually:

$H_0:$ rainy performs sales less than or equal to sunny.

$H_1:$ rainy performs sales greater than sunny.

In [79]:
stat, p = s.ttest_ind(rainy, sunny, alternative='greater')
In [80]:
if p > 0.05:
    print(f"we fail to reject that he has more sales when the weather is sunny than rainy with p-value {p}.")
else:
    print(f"we reject that he has more sales when the weather is sunny than rainy with p-value {p}.")
we reject that he has more sales when the weather is sunny than rainy with p-value 4.042599912536305e-06.

this is confusing, I know, but bear with me. I'll try to explain carefully so we can digest it easily.

it is easy to understand that my alternative hypothesis, $H_1: \mu_0 \gt \mu_1$, says that rainy performs more sales than sunny. the p-value is extremely close to zero, and when the p-value is that small we reject the null hypothesis. in other words, we reject that he has more sales when the weather is sunny than rainy, with p-value $4.042599912536305 \times 10^{-6}$.

but why does the wording of the concept differ from the script?

the key is in the python script stat, p = s.ttest_ind(rainy, sunny, alternative='greater').

we see that the alternative parameter has the argument 'greater'. it means we have to state the alternative hypothesis $H_1$ to match that argument: in the function call, the first variable is rainy, the second is sunny, and the alternative is 'greater', so $H_1$ is "rainy is greater than sunny" and the null hypothesis becomes "rainy is less than or equal to sunny".

it is easier for me to write like this:

if p > 0.05: # fail to reject null hypothesis
    print(f"rainy performs less or equal to sunny with p-value {p}.")
else: # reject null hypothesis
    print(f"rainy performs greater than sunny with p-value {p}.")

but I have to write it as either "reject the null hypothesis" or "fail to reject the null hypothesis".

what we need to understand is that failing to reject "he has more sales when the weather is sunny than rainy" *equals* "rainy performs less than or equal to sunny".

rejecting "he has more sales when the weather is sunny than rainy" *equals* "rainy performs greater than sunny".
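to convince ourselves that only the argument order and the `alternative` flag matter, here is a tiny check with synthetic data (not the real sales): swapping the two groups while flipping 'greater' to 'less' states the same hypothesis, and scipy returns the same p-value.

```python
import numpy as np
from scipy import stats as s

rng = np.random.default_rng(3)
a = rng.normal(81, 5, 45)   # synthetic stand-in for rainy
b = rng.normal(77, 5, 45)   # synthetic stand-in for sunny

# H1: mean(a) > mean(b)
p_greater = s.ttest_ind(a, b, alternative='greater').pvalue
# same hypothesis written the other way around: H1: mean(b) < mean(a)
p_less = s.ttest_ind(b, a, alternative='less').pvalue

print(np.isclose(p_greater, p_less))  # → True
```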

In [81]:
rainy_mean = rainy.mean()
sunny_mean = sunny.mean()

now he knows that, statistically, he can sell more boci when it is raining.

In [135]:
print('it is statistically significant that rainy performs more sales than sunny.')
print(f"the average sales when it is rainy: {rainy_mean}.")
print(f"the average sales when it is sunny: {sunny_mean}.")
print(f"the difference between rainy and sunny is {np.abs(rainy_mean-sunny_mean)}")
it is statistically significant that rainy performs more sales than sunny.
the average sales when it is rainy: 81.65217391304348.
the average sales when it is sunny: 76.86363636363636.
the difference between rainy and sunny is 4.788537549407124

let's make boxplots for rainy and sunny.

In [144]:
plt.figure(figsize=(16,5))
sns.boxplot(data=dataset, 
            x='sales', 
            y='weather', 
            color='cyan',
            showmeans=True, 
            meanprops={'marker':'o',
                       'markerfacecolor':'r'})
plt.title('boxplot rainy vs sunny')
plt.show()

we see that there is a clear difference between rainy and sunny: the rainy boxplot sits a bit further to the right, meaning he gets more sales when it rains.

actionable insight

the big challenge is that he has difficulty selling boci when the sun is shining. these are things he can do:

  1. check the weather forecast to see whether tomorrow will rain, so he knows how many portions to prepare.
  2. bundle other products that are good to consume when it is not raining, to gain sales.
  3. run a free iced tea for the extra spicy boci challenge campaign.

suggestion

  1. do time-series analysis to predict sales based on time order.
  2. do regression analysis to predict sales based on other variables like endorsement budget or ads.
  3. add a customer id or similar so he can conduct customer segmentation.