''' The following code is for the Linear Regression Created by- ANALYTICS VIDHYA '''
# importing required libraries import pandas as pd from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error
# read the train and test dataset train_data = pd.read_csv('train.csv') test_data = pd.read_csv('test.csv')
print(train_data.head())
# shape of the dataset print('\nShape of training data :',train_data.shape) print('\nShape of testing data :',test_data.shape)
# Now, we need to predict the missing target variable in the test data # target variable - Item_Outlet_Sales
# seperate the independent and target variable on training data train_x = train_data.drop(columns=['Item_Outlet_Sales'],axis=1) train_y = train_data['Item_Outlet_Sales']
# seperate the independent and target variable on training data test_x = test_data.drop(columns=['Item_Outlet_Sales'],axis=1) test_y = test_data['Item_Outlet_Sales']
''' Create the object of the Linear Regression model You can also add other parameters and test your code here Some parameters are : fit_intercept and normalize Documentation of sklearn LinearRegression: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html ''' model = LinearRegression()
# fit the model with the training data model.fit(train_x,train_y)
# coefficeints of the trained model print('\nCoefficient of model :', model.coef_)
# intercept of the model print('\nIntercept of model',model.intercept_)
# predict the target on the test dataset predict_train = model.predict(train_x) print('\nItem_Outlet_Sales on training data',predict_train)
# Root Mean Squared Error on training dataset rmse_train = mean_squared_error(train_y,predict_train)**(0.5) print('\nRMSE on train dataset : ', rmse_train)
# predict the target on the testing dataset predict_test = model.predict(test_x) print('\nItem_Outlet_Sales on test data',predict_test)
# Root Mean Squared Error on testing dataset rmse_test = mean_squared_error(test_y,predict_test)**(0.5) print('\nRMSE on test dataset : ', rmse_test)
''' The following code is for Logistic Regression Created by - ANALYTICS VIDHYA '''
# importing required libraries import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score
# read the train and test dataset train_data = pd.read_csv('train-data.csv') test_data = pd.read_csv('test-data.csv')
print(train_data.head())
# shape of the dataset print('Shape of training data :',train_data.shape) print('Shape of testing data :',test_data.shape)
# Now, we need to predict the missing target variable in the test data # target variable - Survived
# seperate the independent and target variable on training data train_x = train_data.drop(columns=['Survived'],axis=1) train_y = train_data['Survived']
# seperate the independent and target variable on testing data test_x = test_data.drop(columns=['Survived'],axis=1) test_y = test_data['Survived']
''' Create the object of the Logistic Regression model You can also add other parameters and test your code here Some parameters are : fit_intercept and penalty Documentation of sklearn LogisticRegression: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html ''' model = LogisticRegression()
# fit the model with the training data model.fit(train_x,train_y)
# coefficeints of the trained model print('Coefficient of model :', model.coef_)
# intercept of the model print('Intercept of model',model.intercept_)
# predict the target on the train dataset predict_train = model.predict(train_x) print('Target on train data',predict_train)
# Accuray Score on train dataset accuracy_train = accuracy_score(train_y,predict_train) print('accuracy_score on train dataset : ', accuracy_train)
# predict the target on the test dataset predict_test = model.predict(test_x) print('Target on test data',predict_test)
# Accuracy Score on test dataset accuracy_test = accuracy_score(test_y,predict_test) print('accuracy_score on test dataset : ', accuracy_test)
''' The following code is for Decision Tree Created by - Analytics Vidhya '''
# importing required libraries import pandas as pd from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import accuracy_score
# read the train and test dataset train_data = pd.read_csv('train-data.csv') test_data = pd.read_csv('test-data.csv')
# shape of the dataset print('Shape of training data :',train_data.shape) print('Shape of testing data :',test_data.shape)
# Now, we need to predict the missing target variable in the test data # target variable - Survived
# seperate the independent and target variable on training data train_x = train_data.drop(columns=['Survived'],axis=1) train_y = train_data['Survived']
# seperate the independent and target variable on testing data test_x = test_data.drop(columns=['Survived'],axis=1) test_y = test_data['Survived']
''' Create the object of the Decision Tree model You can also add other parameters and test your code here Some parameters are : max_depth and max_features Documentation of sklearn DecisionTreeClassifier: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html ''' model = DecisionTreeClassifier()
# fit the model with the training data model.fit(train_x,train_y)
# depth of the decision tree print('Depth of the Decision Tree :', model.get_depth())
# predict the target on the train dataset predict_train = model.predict(train_x) print('Target on train data',predict_train)
# Accuray Score on train dataset accuracy_train = accuracy_score(train_y,predict_train) print('accuracy_score on train dataset : ', accuracy_train)
# predict the target on the test dataset predict_test = model.predict(test_x) print('Target on test data',predict_test)
# Accuracy Score on test dataset accuracy_test = accuracy_score(test_y,predict_test) print('accuracy_score on test dataset : ', accuracy_test)
''' The following code is for Support Vector Machines Created by - ANALYTICS VIDHYA ''' # importing required libraries import pandas as pd from sklearn.svm import SVC from sklearn.metrics import accuracy_score
# read the train and test dataset train_data = pd.read_csv('train-data.csv') test_data = pd.read_csv('test-data.csv')
# shape of the dataset print('Shape of training data :',train_data.shape) print('Shape of testing data :',test_data.shape)
# Now, we need to predict the missing target variable in the test data # target variable - Survived
# seperate the independent and target variable on training data train_x = train_data.drop(columns=['Survived'],axis=1) train_y = train_data['Survived']
# seperate the independent and target variable on testing data test_x = test_data.drop(columns=['Survived'],axis=1) test_y = test_data['Survived']
''' Create the object of the Support Vector Classifier model You can also add other parameters and test your code here Some parameters are : kernal and degree Documentation of sklearn Support Vector Classifier: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html ''' model = SVC()
# fit the model with the training data model.fit(train_x,train_y)
# predict the target on the train dataset predict_train = model.predict(train_x) print('Target on train data',predict_train)
# Accuray Score on train dataset accuracy_train = accuracy_score(train_y,predict_train) print('accuracy_score on train dataset : ', accuracy_train)
# predict the target on the test dataset predict_test = model.predict(test_x) print('Target on test data',predict_test)
# Accuracy Score on test dataset accuracy_test = accuracy_score(test_y,predict_test) print('accuracy_score on test dataset : ', accuracy_test)
''' The following code is for Naive Bayes Created by - ANALYTICS VIDHYA '''
# importing required libraries import pandas as pd from sklearn.naive_bayes import GaussianNB from sklearn.metrics import accuracy_score
# read the train and test dataset train_data = pd.read_csv('train-data.csv') test_data = pd.read_csv('test-data.csv')
# shape of the dataset print('Shape of training data :',train_data.shape) print('Shape of testing data :',test_data.shape)
# Now, we need to predict the missing target variable in the test data # target variable - Survived
# seperate the independent and target variable on training data train_x = train_data.drop(columns=['Survived'],axis=1) train_y = train_data['Survived']
# seperate the independent and target variable on testing data test_x = test_data.drop(columns=['Survived'],axis=1) test_y = test_data['Survived']
''' Create the object of the Naive Bayes model You can also add other parameters and test your code here Some parameters are : var_smoothing Documentation of sklearn GaussianNB: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html ''' model = GaussianNB()
# fit the model with the training data model.fit(train_x,train_y)
# predict the target on the train dataset predict_train = model.predict(train_x) print('Target on train data',predict_train)
# Accuray Score on train dataset accuracy_train = accuracy_score(train_y,predict_train) print('accuracy_score on train dataset : ', accuracy_train)
# predict the target on the test dataset predict_test = model.predict(test_x) print('Target on test data',predict_test)
# Accuracy Score on test dataset accuracy_test = accuracy_score(test_y,predict_test) print('accuracy_score on test dataset : ', accuracy_test)
''' The following code is for the K-Nearest Neighbors Created by - ANALYTICS VIDHYA ''' # importing required libraries import pandas as pd from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import accuracy_score
# read the train and test dataset train_data = pd.read_csv('train-data.csv') test_data = pd.read_csv('test-data.csv')
# shape of the dataset print('Shape of training data :',train_data.shape) print('Shape of testing data :',test_data.shape)
# Now, we need to predict the missing target variable in the test data # target variable - Survived
# seperate the independent and target variable on training data train_x = train_data.drop(columns=['Survived'],axis=1) train_y = train_data['Survived']
# seperate the independent and target variable on testing data test_x = test_data.drop(columns=['Survived'],axis=1) test_y = test_data['Survived']
''' Create the object of the K-Nearest Neighbor model You can also add other parameters and test your code here Some parameters are : n_neighbors, leaf_size Documentation of sklearn K-Neighbors Classifier: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html ''' model = KNeighborsClassifier()
# fit the model with the training data model.fit(train_x,train_y)
# Number of Neighbors used to predict the target print('\nThe number of neighbors used to predict the target : ',model.n_neighbors)
# predict the target on the train dataset predict_train = model.predict(train_x) print('\nTarget on train data',predict_train)
# Accuray Score on train dataset accuracy_train = accuracy_score(train_y,predict_train) print('accuracy_score on train dataset : ', accuracy_train)
# predict the target on the test dataset predict_test = model.predict(test_x) print('Target on test data',predict_test)
# Accuracy Score on test dataset accuracy_test = accuracy_score(test_y,predict_test) print('accuracy_score on test dataset : ', accuracy_test)
''' The following code is for the K-Means Created by - ANALYTICS VIDHYA '''
# importing required libraries import pandas as pd from sklearn.cluster import KMeans
# read the train and test dataset train_data = pd.read_csv('train-data.csv') test_data = pd.read_csv('test-data.csv')
# shape of the dataset print('Shape of training data :',train_data.shape) print('Shape of testing data :',test_data.shape)
# Now, we need to divide the training data into differernt clusters # and predict in which cluster a particular data point belongs.
''' Create the object of the K-Means model You can also add other parameters and test your code here Some parameters are : n_clusters and max_iter Documentation of sklearn KMeans: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html '''
model = KMeans()
# fit the model with the training data model.fit(train_data)
# Number of Clusters print('\nDefault number of Clusters : ',model.n_clusters)
# predict the clusters on the train dataset predict_train = model.predict(train_data) print('\nCLusters on train data',predict_train)
# predict the target on the test dataset predict_test = model.predict(test_data) print('Clusters on test data',predict_test)
# Now, we will train a model with n_cluster = 3 model_n3 = KMeans(n_clusters=3)
# fit the model with the training data model_n3.fit(train_data)
# Number of Clusters print('\nNumber of Clusters : ',model_n3.n_clusters)
# predict the clusters on the train dataset predict_train_3 = model_n3.predict(train_data) print('\nCLusters on train data',predict_train_3)
# predict the target on the test dataset predict_test_3 = model_n3.predict(test_data) print('Clusters on test data',predict_test_3)
''' The following code is for the Random Forest Created by - ANALYTICS VIDHYA '''
# importing required libraries import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score
# read the train and test dataset train_data = pd.read_csv('train-data.csv') test_data = pd.read_csv('test-data.csv')
# view the top 3 rows of the dataset print(train_data.head(3))
# shape of the dataset print('\nShape of training data :',train_data.shape) print('\nShape of testing data :',test_data.shape)
# Now, we need to predict the missing target variable in the test data # target variable - Survived
# seperate the independent and target variable on training data train_x = train_data.drop(columns=['Survived'],axis=1) train_y = train_data['Survived']
# seperate the independent and target variable on testing data test_x = test_data.drop(columns=['Survived'],axis=1) test_y = test_data['Survived']
''' Create the object of the Random Forest model You can also add other parameters and test your code here Some parameters are : n_estimators and max_depth Documentation of sklearn RandomForestClassifier: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html ''' model = RandomForestClassifier()
# fit the model with the training data model.fit(train_x,train_y)
# number of trees used print('Number of Trees used : ', model.n_estimators)
# predict the target on the train dataset predict_train = model.predict(train_x) print('\nTarget on train data',predict_train)
# Accuray Score on train dataset accuracy_train = accuracy_score(train_y,predict_train) print('\naccuracy_score on train dataset : ', accuracy_train)
# predict the target on the test dataset predict_test = model.predict(test_x) print('\nTarget on test data',predict_test)
# Accuracy Score on test dataset accuracy_test = accuracy_score(test_y,predict_test) print('\naccuracy_score on test dataset : ', accuracy_test)
''' The following code is for Principal Component Analysis (PCA) Created by - ANALYTICS VIDHYA ''' # importing required libraries import pandas as pd from sklearn.decomposition import PCA from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error
# read the train and test dataset train_data = pd.read_csv('train.csv') test_data = pd.read_csv('test.csv')
# view the top 3 rows of the dataset print(train_data.head(3))
# shape of the dataset print('\nShape of training data :',train_data.shape) print('\nShape of testing data :',test_data.shape)
# Now, we need to predict the missing target variable in the test data # target variable - Survived
# seperate the independent and target variable on training data # target variable - Item_Outlet_Sales train_x = train_data.drop(columns=['Item_Outlet_Sales'],axis=1) train_y = train_data['Item_Outlet_Sales']
# seperate the independent and target variable on testing data test_x = test_data.drop(columns=['Item_Outlet_Sales'],axis=1) test_y = test_data['Item_Outlet_Sales']
print('\nTraining model with {} dimensions.'.format(train_x.shape[1]))
# create object of model model = LinearRegression()
# fit the model with the training data model.fit(train_x,train_y)
# predict the target on the train dataset predict_train = model.predict(train_x)
# Accuray Score on train dataset rmse_train = mean_squared_error(train_y,predict_train)**(0.5) print('\nRMSE on train dataset : ', rmse_train)
# predict the target on the test dataset predict_test = model.predict(test_x)
# Accuracy Score on test dataset rmse_test = mean_squared_error(test_y,predict_test)**(0.5) print('\nRMSE on test dataset : ', rmse_test)
# create the object of the PCA (Principal Component Analysis) model # reduce the dimensions of the data to 12 ''' You can also add other parameters and test your code here Some parameters are : svd_solver, iterated_power Documentation of sklearn PCA: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html ''' model_pca = PCA(n_components=12)
print('\nTraining model with {} dimensions.'.format(new_train.shape[1]))
# create object of model model_new = LinearRegression()
# fit the model with the training data model_new.fit(new_train,train_y)
# predict the target on the new train dataset predict_train_pca = model_new.predict(new_train)
# Accuray Score on train dataset rmse_train_pca = mean_squared_error(train_y,predict_train_pca)**(0.5) print('\nRMSE on new train dataset : ', rmse_train_pca)
# predict the target on the new test dataset predict_test_pca = model_new.predict(new_test)
# Accuracy Score on test dataset rmse_test_pca = mean_squared_error(test_y,predict_test_pca)**(0.5) print('\nRMSE on new test dataset : ', rmse_test_pca)
''' The following code is for Gradient Boosting Created by - ANALYTICS VIDHYA '''
# importing required libraries import pandas as pd from sklearn.ensemble import GradientBoostingClassifier from sklearn.metrics import accuracy_score
# read the train and test dataset train_data = pd.read_csv('train-data.csv') test_data = pd.read_csv('test-data.csv')
# shape of the dataset print('Shape of training data :',train_data.shape) print('Shape of testing data :',test_data.shape)
# Now, we need to predict the missing target variable in the test data # target variable - Survived
# seperate the independent and target variable on training data train_x = train_data.drop(columns=['Survived'],axis=1) train_y = train_data['Survived']
# seperate the independent and target variable on testing data test_x = test_data.drop(columns=['Survived'],axis=1) test_y = test_data['Survived']
''' Create the object of the GradientBoosting Classifier model You can also add other parameters and test your code here Some parameters are : learning_rate, n_estimators Documentation of sklearn GradientBoosting Classifier: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html ''' model = GradientBoostingClassifier(n_estimators=100,max_depth=5)
# fit the model with the training data model.fit(train_x,train_y)
# predict the target on the train dataset predict_train = model.predict(train_x) print('\nTarget on train data',predict_train)
# Accuray Score on train dataset accuracy_train = accuracy_score(train_y,predict_train) print('\naccuracy_score on train dataset : ', accuracy_train)
# predict the target on the test dataset predict_test = model.predict(test_x) print('\nTarget on test data',predict_test)
# Accuracy Score on test dataset accuracy_test = accuracy_score(test_y,predict_test) print('\naccuracy_score on test dataset : ', accuracy_test)
''' The following code is for XGBoost Created by - ANALYTICS VIDHYA '''
# importing required libraries import pandas as pd from xgboost import XGBClassifier from sklearn.metrics import accuracy_score
# read the train and test dataset train_data = pd.read_csv('train-data.csv') test_data = pd.read_csv('test-data.csv')
# shape of the dataset print('Shape of training data :',train_data.shape) print('Shape of testing data :',test_data.shape)
# Now, we need to predict the missing target variable in the test data # target variable - Survived
# seperate the independent and target variable on training data train_x = train_data.drop(columns=['Survived'],axis=1) train_y = train_data['Survived']
# seperate the independent and target variable on testing data test_x = test_data.drop(columns=['Survived'],axis=1) test_y = test_data['Survived']
''' Create the object of the XGBoost model You can also add other parameters and test your code here Some parameters are : max_depth and n_estimators Documentation of xgboost: https://xgboost.readthedocs.io/en/latest/ ''' model = XGBClassifier()
# fit the model with the training data model.fit(train_x,train_y)
# predict the target on the train dataset predict_train = model.predict(train_x) print('\nTarget on train data',predict_train)
# Accuray Score on train dataset accuracy_train = accuracy_score(train_y,predict_train) print('\naccuracy_score on train dataset : ', accuracy_train)
# predict the target on the test dataset predict_test = model.predict(test_x) print('\nTarget on test data',predict_test)
# Accuracy Score on test dataset accuracy_test = accuracy_score(test_y,predict_test) print('\naccuracy_score on test dataset : ', accuracy_test)
#Read training and testing files train = pd.read_csv("train.csv") test = pd.read_csv("test.csv")
#Imputing missing values for both train and test train.fillna(-999, inplace=True) test.fillna(-999,inplace=True)
#Creating a training set for modeling and validation set to check model performance X = train.drop(['Item_Outlet_Sales'], axis=1) y = train.Item_Outlet_Sales
from sklearn.model_selection import train_test_split
#importing library and building model from catboost import CatBoostRegressormodel=CatBoostRegressor(iterations=50, depth=3, learning_rate=0.1, loss_function='RMSE')