Импорт необходимых библиотек
import matplotlib.pyplot as plt import seaborn as sns import scipy import re import missingno as mso from scipy import stats from scipy.stats import ttest_ind from scipy.stats import pearsonr from sklearn.preprocessing import StandardScaler,LabelEncoder from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.ensemble import GradientBoostingClassifier from xgboost import XGBClassifier from sklearn.metrics import accuracy_score,confusion_matrix,classification_report import pandas as pd
Чтение CSV-файла
# Load the Seattle weather dataset and preview its first rows.
data = pd.read_csv(filepath_or_buffer="/content/seattle-weather.csv")
data.head(n=5)
Форма данных
# Report the dataset dimensions as a (rows, columns) tuple.
n_rows, n_cols = data.shape
(n_rows, n_cols)
(1461, 6)
import warnings

# Suppress all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Bar chart of how many rows fall into each weather class.
# `x` must be passed by keyword: recent seaborn releases reject a positional
# column name here because only `data` may be positional.
sns.countplot(x="weather", data=data, palette='hls')
# Count the rows belonging to each weather class.
countrain = len(data[data.weather == 'rain'])
countsun = len(data[data.weather == 'sun'])
countdrizzle = len(data[data.weather == 'drizzle'])
countsnow = len(data[data.weather == 'snow'])
countfog = len(data[data.weather == 'fog'])

# Print each class share as a percentage with two decimals.
# The format spec is '.2f' (precision 2); the previous '2f' only set a
# minimum width of 2 and therefore printed six decimals.
total_rows = len(data.weather)
for label, count in (
    ('rain', countrain),
    ('sun', countsun),
    ('drizzle', countdrizzle),
    ('snow', countsnow),
    ('fog', countfog),
):
    print('percent of {}:{:.2f}%'.format(label, count / total_rows * 100))
# Summary statistics for the four measurement columns.
measurement_columns = ['precipitation', 'temp_max', 'temp_min', 'wind']
data[measurement_columns].describe()
# Histograms with KDE overlays for each measurement, laid out on a 2x2 grid.
sns.set(style='darkgrid')
fig, axs = plt.subplots(2, 2, figsize=(10, 8))

histogram_specs = (
    ('precipitation', axs[0, 0], 'green'),
    ('temp_max', axs[0, 1], 'red'),
    ('temp_min', axs[1, 0], 'blue'),
    ('wind', axs[1, 1], 'orange'),
)
for column, axis, colour in histogram_specs:
    sns.histplot(data=data, x=column, kde=True, ax=axis, color=colour)
# Violin plots of each measurement on a 2x2 grid.
sns.set(style='darkgrid')
fig, axs = plt.subplots(2, 2, figsize=(10, 8))

# `kde` is a histplot option, not a violinplot one, so it is not passed here:
# depending on the seaborn version it was either ignored or forwarded to
# matplotlib, which rejects it.
violin_specs = (
    ('precipitation', axs[0, 0], 'green'),
    ('temp_max', axs[0, 1], 'red'),
    ('temp_min', axs[1, 0], 'blue'),
    ('wind', axs[1, 1], 'orange'),
)
for column, axis, colour in violin_specs:
    sns.violinplot(data=data, x=column, ax=axis, color=colour)
# Distribution of precipitation within each weather class.
plt.figure(figsize=(12, 6))
# x/y are passed by keyword; recent seaborn releases reject positional
# column names because only `data` may be positional.
sns.boxplot(x='precipitation', y='weather', data=data, palette='YlOrBr')
# Distribution of maximum temperature within each weather class.
plt.figure(figsize=(12, 6))
# x/y are passed by keyword; recent seaborn releases reject positional
# column names because only `data` may be positional.
sns.boxplot(x='temp_max', y='weather', data=data, palette='inferno')
# Distribution of wind within each weather class.
plt.figure(figsize=(12, 6))
# x/y are passed by keyword; recent seaborn releases reject positional
# column names because only `data` may be positional.
sns.boxplot(x='wind', y='weather', data=data, palette='YlOrBr')
# Distribution of minimum temperature within each weather class.
plt.figure(figsize=(12, 6))
# x/y are passed by keyword; recent seaborn releases reject positional
# column names because only `data` may be positional.
sns.boxplot(x='temp_min', y='weather', data=data, palette='YlOrBr')
# Annotated heatmap of pairwise correlations between the numeric columns.
plt.figure(figsize=(12, 6))
# Restrict to numeric columns explicitly: at this point the frame still holds
# the string columns 'date' and 'weather', and pandas >= 2.0 raises instead of
# silently skipping non-numeric columns in corr().
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm')
# Scatter precipitation against maximum temperature, then report their
# Pearson correlation and an independent two-sample t-test.
data.plot(x="precipitation", y='temp_max', style='o')

precipitation_series = data['precipitation']
temp_max_series = data['temp_max']
print('pearsons correlation: ', precipitation_series.corr(temp_max_series))
print('T test and P value: ', stats.ttest_ind(precipitation_series, temp_max_series))
pearsons correlation:  -0.22855481643297046
T test and P value:  Ttest_indResult(statistic=-51.60685279531918, pvalue=0.0)
# Scatter wind against maximum temperature, then report their Pearson
# correlation and an independent two-sample t-test.
data.plot(x="wind", y='temp_max', style='o')

wind_series = data['wind']
temp_max_series = data['temp_max']
print('pearsons correlation: ', wind_series.corr(temp_max_series))
print('T test and P value: ', stats.ttest_ind(wind_series, temp_max_series))
pearsons correlation:  -0.16485663487495486
T test and P value:  Ttest_indResult(statistic=-67.3601643301846, pvalue=0.0)
# Scatter maximum temperature against minimum temperature.
data.plot(x='temp_max', y='temp_min', style='o')
# Number of missing values in each column.
missing_mask = data.isnull()
missing_mask.sum()
# Bar chart of per-column completeness (date column excluded), drawn on the
# right-hand slot of a 1x2 subplot layout.
plt.figure(figsize=(12, 6))
axz = plt.subplot(1, 2, 2)
without_date = data.drop(columns=['date'])
mso.bar(without_date, ax=axz, fontsize=12)
import numpy as np

# Drop the date column; it is not used in the steps below.
data = data.drop(['date'], axis=1)

# Compute the IQR fences on numeric columns only. The frame still contains the
# string 'weather' column here, and pandas >= 2.0 raises when asked for a
# quantile of (or an ordering comparison against) non-numeric data.
numeric_part = data.select_dtypes(include='number')
Q1 = numeric_part.quantile(0.25)
Q3 = numeric_part.quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier if any numeric value lies beyond 1.5 * IQR from the
# quartiles; such rows are removed.
is_outlier = (
    (numeric_part < (Q1 - 1.5 * IQR)) | (numeric_part > (Q3 + 1.5 * IQR))
).any(axis=1)
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the unfiltered data.
data = data[~is_outlier].copy()

# Square-root transform of precipitation and wind.
data['precipitation'] = np.sqrt(data['precipitation'])
data['wind'] = np.sqrt(data['wind'])

# Re-plot the four histograms on the cleaned, transformed data.
sns.set(style='darkgrid')
fig, axs = plt.subplots(2, 2, figsize=(10, 8))
sns.histplot(data=data, x="precipitation", kde=True, ax=axs[0, 0], color='green')
sns.histplot(data=data, x="temp_max", kde=True, ax=axs[0, 1], color='red')
sns.histplot(data=data, x="temp_min", kde=True, ax=axs[1, 0], color='blue')
sns.histplot(data=data, x="wind", kde=True, ax=axs[1, 1], color='orange')
# Preview the first five rows of the current frame.
data.head(n=5)
# Replace the weather labels with the integer codes assigned by a
# LabelEncoder fitted on that same column.
lc = LabelEncoder()
lc.fit(data['weather'])
data['weather'] = lc.transform(data['weather'])
data.head()
# Feature matrix: every column except the target.
# Features stay floating point. Casting them to int truncated the continuous
# measurements, and the sample passed to predict() further down is given as
# floats, so training and inference inputs would otherwise disagree.
is_feature = data.columns != 'weather'
x = data.loc[:, is_feature].astype(float).values

# Target vector: the encoded weather class.
y = data['weather'].values

# Show which class codes are present.
data.weather.unique()
# Hold out 10% of the rows for evaluation (fixed seed), then fit and score a
# k-nearest-neighbours classifier with default settings.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=2
)

knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
knn_accuracy = knn.score(x_test, y_test) * 100
print('KNN accuracy:{:.2f}%'.format(knn_accuracy))
# Fit a support-vector classifier with default settings and report its
# accuracy on the held-out split.
svm = SVC()
svm.fit(x_train, y_train)
svm_accuracy = svm.score(x_test, y_test) * 100
print('SVM accuracy:{:.2f}%'.format(svm_accuracy))
# Fit a gradient-boosting classifier and report its accuracy on the held-out
# split.
gbc = GradientBoostingClassifier(
    subsample=0.5,
    n_estimators=450,
    max_depth=5,
    max_leaf_nodes=25,
)
gbc.fit(x_train, y_train)
gbc_accuracy = gbc.score(x_test, y_test) * 100
print('GBC accuracy:{:.2f}%'.format(gbc_accuracy))
import warnings

# Suppress all warnings, then fit an XGBoost classifier with default settings
# and report its accuracy on the held-out split.
warnings.filterwarnings('ignore')

xgb = XGBClassifier()
xgb.fit(x_train, y_train)
xgb_accuracy = xgb.score(x_test, y_test) * 100
print('XGB accuracy:{:.2f}%'.format(xgb_accuracy))
# Classify one hand-written sample with the trained XGBoost model.
# NOTE(review): the four values presumably follow the training column order
# (precipitation, temp_max, temp_min, wind), with precipitation and wind
# already square-root transformed; confirm against data.columns.
# The variable is not called `input`, so the builtin input() stays usable.
sample = [[1.140175, 8.9, 2.8, 2.469818]]
ot = xgb.predict(sample)

# Class code -> display name; any other code falls back to 'sun'.
weather_names = {0: 'Drizzle', 1: 'fog', 2: 'rain', 3: 'snow'}

print('the weather is:')
# predict() returns one code per input row; take the single prediction.
print(weather_names.get(int(ot[0]), 'sun'))
import pickle

# Serialise the trained XGBoost model to disk.
file = 'model.pkl'
# The context manager closes the file handle even if pickling fails;
# previously the handle returned by open() was never closed.
with open(file, 'wb') as model_file:
    pickle.dump(xgb, model_file)