import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

#reading file 
df1 = pd.read_excel('C:/Users/Acer/Downloads/Project/SuperStore Sales DataSet.xlsx')

df1.head()

#Dropping Multiple Rows
df1.drop('Row ID',axis=1,inplace=True)

df1.head()

# Dropping the multiple columns togethor
df1.drop(['Order Date'],axis=1,inplace=True)

df1.head()

# Drop Rows (axis=0 for rows)
df1.drop(1,axis=0,inplace=True)
df1.head()

# Dropping Multiple Rows
df1.drop([2,3],axis =0,inplace=True)
df1.head()

import random

cities = ["Faridabad", "Delhi", "Noida", "Mumbai", "Bangalore", "Chennai", "Hyderabad", "Pune", "Jaipur", "Kolkata"]
ranks = ["1st", "2nd", "3rd", "4th", "5th"]

raw_data = {
    "city": [random.choice(cities) for _ in range(60)],
    "rank": [random.choice(ranks) for _ in range(60)],
    "score1": [random.randint(30, 80) for _ in range(60)],
    "score2": [random.randint(50, 100) for _ in range(60)]
}

print(raw_data)
df2=pd.DataFrame(raw_data,columns=["city","rank","score1","score2"])

{'city': ['Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Bangalore', 'Pune', 'Kolkata', 'Faridabad', 'Hyderabad', 'Hyderabad', 'Hyderabad', 'Hyderabad', 'Pune', 'Faridabad', 'Kolkata', 'Bangalore', 'Hyderabad', 'Pune', 'Delhi', 'Kolkata', 'Jaipur', 'Kolkata', 'Mumbai', 'Jaipur', 'Noida', 'Bangalore', 'Bangalore', 'Chennai', 'Jaipur', 'Jaipur', 'Jaipur', 'Mumbai', 'Jaipur', 'Kolkata', 'Kolkata', 'Mumbai', 'Delhi', 'Pune', 'Chennai', 'Hyderabad', 'Mumbai', 'Bangalore', 'Pune', 'Jaipur', 'Jaipur', 'Bangalore', 'Faridabad', 'Hyderabad', 'Chennai', 'Kolkata', 'Mumbai', 'Faridabad', 'Delhi', 'Mumbai', 'Chennai', 'Faridabad', 'Faridabad', 'Noida', 'Faridabad', 'Kolkata'], 'rank': ['3rd', '1st', '1st', '2nd', '5th', '3rd', '3rd', '4th', '5th', '4th', '4th', '5th', '1st', '5th', '1st', '2nd', '1st', '1st', '1st', '3rd', '3rd', '2nd', '3rd', '2nd', '2nd', '4th', '4th', '5th', '1st', '4th', '5th', '1st', '3rd', '1st', '5th', '1st', '4th', '2nd', '2nd', '3rd', '4th', '2nd', '4th', '5th', '1st', '3rd', '2nd', '1st', '5th', '3rd', '1st', '3rd', '2nd', '2nd', '1st', '3rd', '3rd', '3rd', '5th', '5th'], 'score1': [50, 74, 59, 67, 63, 48, 32, 48, 54, 80, 72, 75, 61, 58, 70, 35, 40, 51, 47, 77, 76, 37, 45, 41, 30, 70, 60, 41, 61, 73, 41, 32, 66, 71, 70, 40, 57, 66, 43, 65, 61, 73, 65, 39, 52, 72, 67, 40, 70, 44, 80, 74, 79, 58, 62, 32, 36, 52, 43, 38], 'score2': [53, 80, 90, 51, 73, 100, 89, 65, 72, 84, 98, 67, 54, 100, 51, 87, 100, 67, 86, 94, 57, 54, 59, 66, 70, 82, 76, 92, 65, 76, 66, 97, 65, 63, 65, 54, 83, 94, 96, 82, 76, 84, 56, 69, 88, 53, 62, 55, 51, 59, 74, 62, 85, 51, 88, 96, 76, 55, 52, 76]}

df2

#check for duplicate data
df2.duplicated()
#So, thereisno duplicate row in a dataframe

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
37    False
38    False
39    False
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
50    False
51    False
52    False
53    False
54    False
55    False
56    False
57    False
58    False
59    False
dtype: bool

#check for duplicate rows in city
df2.duplicated(['city'])
#the first occurrences of data also treated as False

0     False
1     False
2     False
3     False
4      True
5      True
6     False
7     False
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20    False
21     True
22    False
23     True
24    False
25     True
26     True
27    False
28     True
29     True
30     True
31     True
32     True
33     True
34     True
35     True
36     True
37     True
38     True
39     True
40     True
41     True
42     True
43     True
44     True
45     True
46     True
47     True
48     True
49     True
50     True
51     True
52     True
53     True
54     True
55     True
56     True
57     True
58     True
59     True
dtype: bool

df2.duplicated(['rank'])
#the first occurrences of data also treated as False

0     False
1     False
2      True
3     False
4     False
5      True
6      True
7     False
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32     True
33     True
34     True
35     True
36     True
37     True
38     True
39     True
40     True
41     True
42     True
43     True
44     True
45     True
46     True
47     True
48     True
49     True
50     True
51     True
52     True
53     True
54     True
55     True
56     True
57     True
58     True
59     True
dtype: bool

df2.duplicated(['rank'],keep='last')
# the last occurances is treated as True, when we classify keep as last

0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32     True
33     True
34     True
35     True
36     True
37     True
38     True
39     True
40     True
41     True
42    False
43     True
44     True
45     True
46     True
47     True
48     True
49     True
50     True
51     True
52     True
53    False
54    False
55     True
56     True
57    False
58     True
59    False
dtype: bool

#checking duplicate values on the basis of combinations
df2.duplicated(['city','rank'])

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10     True
11     True
12    False
13    False
14    False
15    False
16     True
17     True
18    False
19     True
20    False
21    False
22    False
23    False
24    False
25    False
26     True
27    False
28    False
29    False
30    False
31    False
32     True
33     True
34    False
35     True
36    False
37     True
38    False
39    False
40    False
41     True
42    False
43     True
44     True
45    False
46    False
47     True
48     True
49     True
50     True
51    False
52    False
53    False
54    False
55     True
56     True
57    False
58     True
59     True
dtype: bool

df2.drop_duplicates()
# No rows are dropped because there is no duplicate rows

df2.drop_duplicates(['city'])
#all the duplicate cities are dropped

df2.drop_duplicates(['city','rank'])
# all the duplicate rows are dropped on the basis of combination of city and␣rank column

df.describe()

df1.head()

df1['Profit'].describe()

count    5898.000000
mean       29.692769
std       259.654675
min     -6599.978000
25%         1.794375
50%         8.502250
75%        28.436400
max      8399.976000
Name: Profit, dtype: float64

import seaborn as sns

sns.boxplot(x=df1['Profit'])

<Axes: xlabel='Profit'>

sns.boxplot(x=df1['Sales'])

<Axes: xlabel='Sales'>

sns.boxplot(x=df1['Quantity'])

<Axes: xlabel='Quantity'>

df1.shape

(5898, 21)

# Let's calculate first quartile,third_quartile and inter_quartile range
first_quartile = df1['Profit'].quantile(.25)
third_quartile = df1['Profit'].quantile(.75)
IQR = third_quartile - first_quartile

new_boundary = third_quartile + 3 * IQR

new_boundary = third_quartile + 3 * IQR
# dropped the outliers data
new_data = df1.drop(df1[df1['Profit']>new_boundary].index, axis = 0,inplace=False)
# 12 rows are dropped
new_data.shape

(5423, 21)

sns.boxplot(x=new_data['Profit'])

<Axes: xlabel='Profit'>

# Let's calculate first quartile,third_quartile and inter_quartile range
first_quartile1 = df1['Quantity'].quantile(.25)
third_quartile1 = df1['Quantity'].quantile(.75)
IQR = third_quartile - first_quartile

new_boundary1 = third_quartile1 + 3 * IQR

# dropped the outliers data
new_data1 = df1.drop(df1[df1['Quantity']>new_boundary1].index, axis = 0,inplace=False)
# 12 rows are dropped
new_data1.shape

(5898, 21)

sns.boxplot(x=new_data1['Quantity'])

<Axes: xlabel='Quantity'>

import random

# List of names (50 names)
names = [
    'Mohan', 'Maria', 'Deepak', 'Kunal', 'Piyush', 'Avinash', 'Lisa', 'Smita', 'Tanu', 'Khusboo',
    'Nishant', 'Johnson', 'Donald', 'Rakesh', 'Pritvi', 'Roy', 'Ashish', 'Abhishek', 'Jassi', 'Nikita',
    'Aman', 'Divya', 'Sneha', 'Ritu', 'Anuj', 'Chetan', 'Saurabh', 'Rohit', 'Gaurav', 'Meena',
    'Seema', 'Priya', 'Alok', 'Sanya', 'Nilesh', 'Vikas', 'Ankita', 'Suraj', 'Bhavna', 'Kiran',
    'Simran', 'Radhika', 'Vivek', 'Pranav', 'Hemant', 'Aditi', 'Komal', 'Tarun', 'Neha', 'Ishaan'
]

# Generate random realistic heights between 1.4m and 2.1m
heights = [round(random.uniform(1.4, 2.1), 2) for _ in range(50)]

# Create the DataFrame
raw = {
    'name': names,
    'height': heights
}

df3 = pd.DataFrame(raw)

df3

df3.describe()

import pandas as pd
import random
import numpy as np

# Names (50 different names)
names = [
    'Mohan', 'Maria', 'Deepak', 'Kunal', 'Piyush', 'Avinash', 'Lisa', 'Smita', 'Tanu', 'Khusboo',
    'Nishant', 'Johnson', 'Donald', 'Rakesh', 'Pritvi', 'Roy', 'Ashish', 'Abhishek', 'Jassi', 'Nikita',
    'Aman', 'Divya', 'Sneha', 'Ritu', 'Anuj', 'Chetan', 'Saurabh', 'Rohit', 'Gaurav', 'Meena',
    'Seema', 'Priya', 'Alok', 'Sanya', 'Nilesh', 'Vikas', 'Ankita', 'Suraj', 'Bhavna', 'Kiran',
    'Simran', 'Radhika', 'Vivek', 'Pranav', 'Hemant', 'Aditi', 'Komal', 'Tarun', 'Neha', 'Ishaan'
]

# Generate 45 "normal" heights around 5.5-6.5 range
normal_heights = [round(random.uniform(4.5, 7.0), 2) for _ in range(45)]

# Add 5 "outlier" very large heights
outliers = [round(random.uniform(12.0, 40.0), 2) for _ in range(5)]

# Combine and shuffle
heights = normal_heights + outliers
random.shuffle(heights)

# Create DataFrame
raw = {
    'name': names,
    'height': heights
}

df4 = pd.DataFrame(raw)

df4.describe()

# Calculating the Q1 ,Q3
Q1 = df4.height.quantile(.25)
Q3 = df4.height.quantile(.75)
Q1 , Q3

(5.37, 6.4799999999999995)

# Calculating IQR
IQR = Q3 - Q1
IQR

1.1099999999999994

# Calculating IQR
lower_limit = Q1 - 1.5 * (IQR)
upper_limit = Q3 + 1.5 * (IQR)
lower_limit,upper_limit

(3.705000000000001, 8.145)

#outliers
df4[(df4.height<lower_limit)|(df4.height>upper_limit)]

# data with no outliers
df_no_outliers = df4[(df4.height>lower_limit) & (df4.height < upper_limit) ]

df_no_outliers

df5 =pd.read_csv("D:/docs/dataset/archive/diabetes.csv")

df5.sample(5)

df5.shape

(768, 9)

plt.hist(df5.BMI,bins=20,rwidth=0.8)
plt.xlabel('Body Mass Index (BMI)')
plt.ylabel('Count')
plt.show()

from scipy.stats import norm
import numpy as np
plt.hist(df5.BMI,bins=20,rwidth=0.8,density=True)
plt.xlabel('BMI')
plt.ylabel('Count')
rng= np.arange(df5.BMI.min(),df5.BMI.max(),0.1)
plt.plot(rng,norm.pdf(rng,df5.BMI.mean(),df5.BMI.std()))

[<matplotlib.lines.Line2D at 0x264af9c21b0>]

df5.BMI.std()

7.8841603203754405

df5.BMI.mean()

31.992578124999998

upper_limit = df5.BMI.mean() + 3 * df5.BMI.std()
upper_limit

55.645059086126324

#Calculating lower_limit
lower_limit = df5.BMI.mean() - 3 * df5.BMI.std()
lower_limit

8.340097163873676

# checking the outliers
df5[(df5.BMI>upper_limit )| (df5.BMI <lower_limit)]

df_no_outlier_std_dev = df[(df5.BMI<upper_limit ) & (df5.BMI>lower_limit)]
df_no_outlier_std_dev.shape

(754, 9)

df5.shape[0] - df_no_outlier_std_dev.shape[0]

14

df5['zscore'] = (df5.BMI - df5.BMI.mean())/df5.BMI.std()

df5.head(5)

#outliers
df5[(df5['zscore']>3) | (df5['zscore']<-3)]

#removing outliers as per Z score
df_no_outliers = df5[(df5.zscore <3) & (df5.zscore > -3)]
df_no_outliers.head()

#checkig the number of ouliers
df5.shape[0] - df_no_outliers.shape[0]

14

df5.head()

df5['Pregnancies']

0       6
1       1
2       8
3       1
4       0
       ..
763    10
764     2
765     5
766     1
767     1
Name: Pregnancies, Length: 768, dtype: int64

from collections import Counter

Counter(df5.Pregnancies)

Counter({1: 135,
         0: 111,
         2: 103,
         3: 75,
         4: 68,
         5: 57,
         6: 50,
         7: 45,
         8: 38,
         9: 28,
         10: 24,
         11: 11,
         13: 10,
         12: 9,
         14: 2,
         15: 1,
         17: 1})

# to get the frequency
df.Pregnancies.value_counts()

Pregnancies
1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: count, dtype: int64

#checking the type
type(df5.Pregnancies.value_counts())

pandas.core.series.Series

g1 = df5['Pregnancies'].value_counts()

g1.plot()

<Axes: xlabel='Pregnancies'>

#Visualizing through a bar chart
g1.plot(kind='bar')

<Axes: xlabel='Pregnancies'>

#Visualizing through pie chart
g1.plot(kind='pie')

<Axes: ylabel='count'>

tdf = pd.read_csv('D:/Pdf/college/titanic.csv')

tdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  156 non-null    int64  
 1   Survived     156 non-null    int64  
 2   Pclass       156 non-null    int64  
 3   Lname        156 non-null    object 
 4   Name         156 non-null    object 
 5   Sex          156 non-null    object 
 6   Age          126 non-null    float64
 7   SibSp        156 non-null    int64  
 8   Parch        156 non-null    int64  
 9   Ticket       156 non-null    object 
 10  Fare         156 non-null    float64
 11  Cabin        31 non-null     object 
 12  Embarked     155 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 16.0+ KB

#checking top 5 recordsof the dataset
tdf.head()

#Let's visualize the survivors count
sns.countplot(x=tdf.Survived)
plt.show()

# Lets check how many males and females were survived
sns.countplot(x=titanic_df.Survived,hue=titanic_df.Sex)
plt.show()

#Let's distinguish the data by passenger class
sns.countplot(x=titanic_df.Survived,hue=titanic_df.Pclass)
plt.show()

tips_df = pd.read_csv('D:/docs/downloads/tips.csv')
tips_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB

#checking top 5 recordsof the dataset
tips_df.head()

#distribution of tips
sns.distplot(x=tips_df.tip, hist=True, kde=True, color='r')

<Axes: ylabel='Density'>

# to find out the range
sns.boxplot(tips_df.tip, color = 'b')

<Axes: ylabel='tip'>

fig, axes = plt.subplots(1,2, figsize =(12,6))
#distribution of tips
sns.distplot(x=tips_df.tip, hist=True, kde=True, color='r',ax=axes[0])
# to find out the range
sns.boxplot(tips_df.tip, color = 'b',ax=axes[1])
axes[0].set_title("Distribution of Tips",fontsize=15)
axes[1].set_title("Tips Range",fontsize=15)

Text(0.5, 1.0, 'Tips Range')

x = np.linspace(-1,1,101)
y1 = x
y2 = x**2
y3 = x**3
y4 = x**4

fig,ax = plt.subplots(2,1)
ax[0].plot(x,y1,color ='b')
ax[1].plot(x,y2, color='r')

[<matplotlib.lines.Line2D at 0x264b43b4b30>]

fig,ax = plt.subplots(2,2)
ax[0,0].plot(x,y1, color ='b')
ax[0,1].plot(x,y2, color='r')
ax[1,0].plot(x,y3, color='y')
ax[1,1].plot(x,y4, color='b')
fig.show()

#Load the dataset
d = pd.read_csv('C:/Users/Acer/Downloads/diamonds.csv')

d.head(20)

#checking the shape
d.shape

(53940, 11)

#filter the value where clarity is in {'SI1','VI2'}
d1 =d[d.clarity.isin(['SI1','VS2'])]

#checkig the shape after filtering
d1.shape

(25323, 11)

sns.set_style('darkgrid')

sns.countplot(x='color',data=d1)

<Axes: xlabel='color', ylabel='count'>

#checking the value counts of color
d1.color.value_counts()

color
E    4896
F    4332
G    4323
H    3918
D    3780
I    2593
J    1481
Name: count, dtype: int64

#use y to change the orientation instead of x
sns.countplot(y='cut',data=d1)

<Axes: xlabel='count', ylabel='cut'>

#checking the datatypes
d1.dtypes

Unnamed: 0      int64
carat         float64
cut            object
color          object
clarity        object
depth         float64
table         float64
price           int64
x             float64
y             float64
z             float64
dtype: object

#Providing the order
color_order = ['J','I','H','G','F','E','D']
sns.countplot(x='color',data=d1,order=color_order);
30

30

#checking the value counts
d1.color.value_counts()

color
E    4896
F    4332
G    4323
H    3918
D    3780
I    2593
J    1481
Name: count, dtype: int64

#checking the Index
d1.color.value_counts().index

Index(['E', 'F', 'G', 'H', 'D', 'I', 'J'], dtype='object', name='color')

#plotting as per index (plotting in descending order)
sns.countplot(x='color', data=d1, order=d1.color.value_counts().index);

#plotting in ascending order
sns.countplot(x='color', data=d1, order=d1.color.value_counts().index[::-1]);

# setting the color to Black
sns.countplot(x='color', data=d1, order=d1.color.value_counts().index[::-1],color='lightblue');

# setting the color to Black
sns.countplot(x='color', data=d1, order=d1.color.value_counts().index[::-1],palette='twilight');

#line width and edge color
sns.countplot(x='color',data=d1, lw=4, ec='black')

<Axes: xlabel='color', ylabel='count'>

#hatching
sns.countplot(x='color',data=d1, lw=4, ec='black',hatch='/')

<Axes: xlabel='color', ylabel='count'>

y1 = np.random.randn(70)
y2 = np.random.randn(70)
# Both y1 and y2 should be of same size
plt.scatter(y1,y2)
plt.show()

## adjust axis, if required
y1 = np.random.randn(70)
y2 = np.random.randn(70)
plt.scatter(y1,y2)
plt.axis([-3,3,-3,3]) #plt.axis([Xmin,Xmax,Ymin,Ymax])
plt.show()

# using the scatter plot with comman axis
x = np.arange(70)
y1 = np.random.randn(70)
y2 = np.random.randn(70)
plt.scatter(x,y1, marker='o', label ='y1 with x')
plt.scatter(x,y2, marker='v', label ='y2 with x')
plt.legend(loc='upper left')

<matplotlib.legend.Legend at 0x264b5293ec0>

#assigning the colors
x = np.arange(70)
y1 = np.random.randn(70)
y2 = np.random.randn(70)
plt.scatter(x,y1, marker='o', label ='y1 with x',color='g')
plt.scatter(x,y2, marker='v', label ='y2 with x',color='b')
plt.legend(loc='upper left')

<matplotlib.legend.Legend at 0x264b7b6c320>

!pip install yfinance

Collecting yfinance
  Downloading yfinance-0.2.56-py2.py3-none-any.whl.metadata (5.8 kB)
Requirement already satisfied: pandas>=1.3.0 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (2.2.2)
Requirement already satisfied: numpy>=1.16.5 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (1.26.4)
Requirement already satisfied: requests>=2.31 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (2.32.2)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Requirement already satisfied: platformdirs>=2.0.0 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (3.10.0)
Requirement already satisfied: pytz>=2022.5 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (2024.1)
Requirement already satisfied: frozendict>=2.3.4 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (2.4.2)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.17.9.tar.gz (3.0 MB)
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
      --------------------------------------- 0.0/3.0 MB ? eta -:--:--
      --------------------------------------- 0.0/3.0 MB ? eta -:--:--
     - -------------------------------------- 0.1/3.0 MB 1.8 MB/s eta 0:00:02
     -- ------------------------------------- 0.2/3.0 MB 1.2 MB/s eta 0:00:03
     -- ------------------------------------- 0.2/3.0 MB 1.1 MB/s eta 0:00:03
     -- ------------------------------------- 0.2/3.0 MB 1.1 MB/s eta 0:00:03
     -- ------------------------------------- 0.2/3.0 MB 1.1 MB/s eta 0:00:03
     ---- ----------------------------------- 0.3/3.0 MB 996.0 kB/s eta 0:00:03
     ----- ---------------------------------- 0.4/3.0 MB 983.0 kB/s eta 0:00:03
     ----- ---------------------------------- 0.4/3.0 MB 995.2 kB/s eta 0:00:03
     ------ --------------------------------- 0.5/3.0 MB 1.1 MB/s eta 0:00:03
     ------- -------------------------------- 0.6/3.0 MB 1.1 MB/s eta 0:00:03
     -------- ------------------------------- 0.6/3.0 MB 1.2 MB/s eta 0:00:03
     -------- ------------------------------- 0.6/3.0 MB 1.2 MB/s eta 0:00:03
     --------- ------------------------------ 0.7/3.0 MB 1.1 MB/s eta 0:00:03
     --------- ------------------------------ 0.7/3.0 MB 1.1 MB/s eta 0:00:03
     ----------- ---------------------------- 0.8/3.0 MB 1.1 MB/s eta 0:00:02
     ----------- ---------------------------- 0.9/3.0 MB 1.1 MB/s eta 0:00:03
     ------------ --------------------------- 0.9/3.0 MB 1.1 MB/s eta 0:00:02
     ------------ --------------------------- 0.9/3.0 MB 1.1 MB/s eta 0:00:02
     ------------- -------------------------- 1.0/3.0 MB 1.1 MB/s eta 0:00:02
     ------------- -------------------------- 1.0/3.0 MB 1.0 MB/s eta 0:00:02
     -------------- ------------------------- 1.1/3.0 MB 1.1 MB/s eta 0:00:02
     --------------- ------------------------ 1.2/3.0 MB 1.1 MB/s eta 0:00:02
     --------------- ------------------------ 1.2/3.0 MB 1.1 MB/s eta 0:00:02
     --------------- ------------------------ 1.2/3.0 MB 982.6 kB/s eta 0:00:02
     ---------------- ----------------------- 1.2/3.0 MB 1.0 MB/s eta 0:00:02
     ----------------- ---------------------- 1.3/3.0 MB 1.0 MB/s eta 0:00:02
     ----------------- ---------------------- 1.3/3.0 MB 1.0 MB/s eta 0:00:02
     ------------------ --------------------- 1.4/3.0 MB 1.0 MB/s eta 0:00:02
     ------------------ --------------------- 1.4/3.0 MB 1.0 MB/s eta 0:00:02
     ------------------- -------------------- 1.5/3.0 MB 1.0 MB/s eta 0:00:02
     -------------------- ------------------- 1.5/3.0 MB 1.0 MB/s eta 0:00:02
     -------------------- ------------------- 1.6/3.0 MB 1.0 MB/s eta 0:00:02
     -------------------- ------------------- 1.6/3.0 MB 1.0 MB/s eta 0:00:02
     --------------------- ------------------ 1.6/3.0 MB 992.2 kB/s eta 0:00:02
     --------------------- ------------------ 1.6/3.0 MB 992.2 kB/s eta 0:00:02
     ---------------------- ----------------- 1.7/3.0 MB 991.5 kB/s eta 0:00:02
     ----------------------- ---------------- 1.8/3.0 MB 1.0 MB/s eta 0:00:02
     ----------------------- ---------------- 1.8/3.0 MB 1.0 MB/s eta 0:00:02
     ----------------------- ---------------- 1.8/3.0 MB 1.0 MB/s eta 0:00:02
     ------------------------ --------------- 1.9/3.0 MB 966.5 kB/s eta 0:00:02
     ------------------------- -------------- 1.9/3.0 MB 977.7 kB/s eta 0:00:02
     ------------------------- -------------- 2.0/3.0 MB 965.1 kB/s eta 0:00:02
     -------------------------- ------------- 2.0/3.0 MB 980.5 kB/s eta 0:00:02
     --------------------------- ------------ 2.1/3.0 MB 975.5 kB/s eta 0:00:01
     --------------------------- ------------ 2.1/3.0 MB 968.5 kB/s eta 0:00:01
     ---------------------------- ----------- 2.1/3.0 MB 966.4 kB/s eta 0:00:01
     ---------------------------- ----------- 2.2/3.0 MB 971.3 kB/s eta 0:00:01
     ----------------------------- ---------- 2.2/3.0 MB 967.1 kB/s eta 0:00:01
     ----------------------------- ---------- 2.2/3.0 MB 967.1 kB/s eta 0:00:01
     ------------------------------ --------- 2.3/3.0 MB 957.2 kB/s eta 0:00:01
     ------------------------------ --------- 2.3/3.0 MB 945.0 kB/s eta 0:00:01
     ------------------------------- -------- 2.4/3.0 MB 968.4 kB/s eta 0:00:01
     ------------------------------- -------- 2.4/3.0 MB 968.4 kB/s eta 0:00:01
     -------------------------------- ------- 2.5/3.0 MB 952.9 kB/s eta 0:00:01
     -------------------------------- ------- 2.5/3.0 MB 945.5 kB/s eta 0:00:01
     --------------------------------- ------ 2.5/3.0 MB 953.8 kB/s eta 0:00:01
     --------------------------------- ------ 2.5/3.0 MB 953.8 kB/s eta 0:00:01
     ---------------------------------- ----- 2.6/3.0 MB 943.5 kB/s eta 0:00:01
     ----------------------------------- ---- 2.7/3.0 MB 953.3 kB/s eta 0:00:01
     ----------------------------------- ---- 2.7/3.0 MB 936.1 kB/s eta 0:00:01
     ------------------------------------ --- 2.7/3.0 MB 938.6 kB/s eta 0:00:01
     ------------------------------------ --- 2.7/3.0 MB 938.6 kB/s eta 0:00:01
     ------------------------------------- -- 2.8/3.0 MB 931.8 kB/s eta 0:00:01
     ------------------------------------- -- 2.9/3.0 MB 938.9 kB/s eta 0:00:01
     -------------------------------------- - 2.9/3.0 MB 941.2 kB/s eta 0:00:01
     -------------------------------------- - 2.9/3.0 MB 941.2 kB/s eta 0:00:01
     ---------------------------------------  3.0/3.0 MB 929.7 kB/s eta 0:00:01
     ---------------------------------------  3.0/3.0 MB 924.3 kB/s eta 0:00:01
     ---------------------------------------- 3.0/3.0 MB 923.4 kB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Requirement already satisfied: beautifulsoup4>=4.11.1 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (4.12.3)
Requirement already satisfied: soupsieve>1.2 in c:\users\acer\anaconda3\lib\site-packages (from beautifulsoup4>=4.11.1->yfinance) (2.5)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=1.3.0->yfinance) (2.9.0.post0)
Requirement already satisfied: tzdata>=2022.7 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=1.3.0->yfinance) (2023.3)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.31->yfinance) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.31->yfinance) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.31->yfinance) (2.2.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.31->yfinance) (2024.12.14)
Requirement already satisfied: six>=1.5 in c:\users\acer\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=1.3.0->yfinance) (1.16.0)
Downloading yfinance-0.2.56-py2.py3-none-any.whl (113 kB)
   ---------------------------------------- 0.0/113.7 kB ? eta -:--:--
   --- ------------------------------------ 10.2/113.7 kB ? eta -:--:--
   -------------- ------------------------ 41.0/113.7 kB 653.6 kB/s eta 0:00:01
   ------------------------------- ------- 92.2/113.7 kB 871.5 kB/s eta 0:00:01
   -------------------------------------- 113.7/113.7 kB 663.1 kB/s eta 0:00:00
Downloading multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Building wheels for collected packages: peewee
  Building wheel for peewee (pyproject.toml): started
  Building wheel for peewee (pyproject.toml): finished with status 'done'
  Created wheel for peewee: filename=peewee-3.17.9-py3-none-any.whl size=139127 sha256=d4e0fc6219e187c4aedf623d122849492ef79b7793b740305def523e358a9d50
  Stored in directory: c:\users\acer\appdata\local\pip\cache\wheels\43\ef\2d\2c51d496bf084945ffdf838b4cc8767b8ba1cc20eb41588831
Successfully built peewee
Installing collected packages: peewee, multitasking, yfinance
Successfully installed multitasking-0.0.11 peewee-3.17.9 yfinance-0.2.56

#pip install --upgrade pandas_datareader

Requirement already satisfied: pandas_datareader in c:\users\acer\anaconda3\lib\site-packages (0.10.0)
Requirement already satisfied: lxml in c:\users\acer\anaconda3\lib\site-packages (from pandas_datareader) (5.2.1)
Requirement already satisfied: pandas>=0.23 in c:\users\acer\anaconda3\lib\site-packages (from pandas_datareader) (2.2.2)
Requirement already satisfied: requests>=2.19.0 in c:\users\acer\anaconda3\lib\site-packages (from pandas_datareader) (2.32.2)
Requirement already satisfied: numpy>=1.26.0 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=0.23->pandas_datareader) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=0.23->pandas_datareader) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=0.23->pandas_datareader) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=0.23->pandas_datareader) (2023.3)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.19.0->pandas_datareader) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.19.0->pandas_datareader) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.19.0->pandas_datareader) (2.2.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.19.0->pandas_datareader) (2024.12.14)
Requirement already satisfied: six>=1.5 in c:\users\acer\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=0.23->pandas_datareader) (1.16.0)
Note: you may need to restart the kernel to use updated packages.

# extract data from various Internet sources into a pandas DataFrame
#import pandas_datareader as pdr
import yfinance as yf

#extracting Data
stocks = ['GOOG', 'AMZN']
#data = pdr.get_data_yahoo(stocks, start = '2022-01-01')['Close']
data = yf.download(stocks, start='2022-01-01')['Close']
data.head()

YF.download() has changed argument auto_adjust default to True

[*********************100%***********************]  2 of 2 completed

1 Failed download:
['GOOG']: ConnectionError(ReadTimeoutError("HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out."))

from matplotlib import rcParams
rcParams['figure.figsize'] = 10,6
plt.plot(data.AMZN)
plt.plot(data.GOOG)
plt.grid(True, color='k', linestyle=':')
plt.title("Amazon & Google Prices")
plt.xlabel("Date")

Text(0.5, 0, 'Date')

from matplotlib import rcParams
rcParams['figure.figsize'] = 10,6
plt.plot(data.AMZN)
plt.plot(data.GOOG)
plt.grid(True, color='k', linestyle=':')
plt.title("Amazon & Google Prices")
plt.xlabel("Date")
plt.legend(['GOOG','AMZN'],loc =2)

<matplotlib.legend.Legend at 0x264ba57f260>

#loading the Dataset
tips = pd.read_csv('D:/docs/downloads/tips.csv')

tips.head()

sns.boxplot(tips['size'])

<Axes: ylabel='size'>

sns.boxplot(tips['total_bill'])

<Axes: ylabel='total_bill'>

sns.boxplot(x='sex', y ='total_bill', data=tips)

<Axes: xlabel='sex', ylabel='total_bill'>

sns.boxplot(x='day', y ='total_bill', data=tips)

<Axes: xlabel='day', ylabel='total_bill'>

sns.boxplot(x='day', y ='total_bill', data=tips,hue='sex')

<Axes: xlabel='day', ylabel='total_bill'>

sns.boxplot(x='day', y ='total_bill', data=tips,hue='sex',palette ='husl')

<Axes: xlabel='day', ylabel='total_bill'>

sns.boxplot(x='day', y ='total_bill', data=tips,hue='smoker',palette='coolwarm')

<Axes: xlabel='day', ylabel='total_bill'>

sns.boxplot(x='day', y ='total_bill', data=tips,hue='time')

<Axes: xlabel='day', ylabel='total_bill'>

iris = pd.read_csv('C:/Users/Acer/Downloads/iris.csv')

sns.jointplot(x='total_bill', y='tip', data=tips)

<seaborn.axisgrid.JointGrid at 0x264bb1a0f80>

iris.columns

Index(['150', '4', 'setosa', 'versicolor', 'virginica'], dtype='object')

# sns.jointplot(x='SepalLengthCm', y='SepalWidthCm', data=iris)
# sns.jointplot(x='total_bill', y='tip', data=tips, kind='reg',color='green')

# adding regression lines
sns.jointplot(x='total_bill', y='tip', data=tips, kind='reg')

<seaborn.axisgrid.JointGrid at 0x264bdd90170>

sns.barplot(x='day', y ='tip',data=tips)

<Axes: xlabel='day', ylabel='tip'>

sns.barplot(x='day', y ='total_bill',data=tips)

<Axes: xlabel='day', ylabel='total_bill'>

sns.barplot(x='day', y ='total_bill',data=tips,hue='sex')

<Axes: xlabel='day', ylabel='total_bill'>

sns.barplot(x='day', y ='total_bill',data=tips,hue='sex', palette='winter_r')

<Axes: xlabel='day', ylabel='total_bill'>

sns.barplot(x='day', y ='total_bill',data=tips,hue='smoker')

<Axes: xlabel='day', ylabel='total_bill'>

sns.barplot(x='total_bill', y ='day' , data = tips, palette ='spring')

<Axes: xlabel='total_bill', ylabel='day'>

sns.barplot(x='day', y ='tip' , data = tips, palette='spring',order=['Thur','Fri','Sat','Sun'])

<Axes: xlabel='day', ylabel='tip'>

sns.barplot(x='smoker', y ='tip' , data = tips,hue='sex')

<Axes: xlabel='smoker', ylabel='tip'>

# #Laod the dataset
# df = pd.read_csv('mtcars.csv')

iris = pd.read_csv('C:/Users/Acer/Downloads/iris.csv')
iris.head()

sns.pairplot(iris)

<seaborn.axisgrid.PairGrid at 0x264bb16d760>

sns.pairplot(iris,hue='versicolor')

<seaborn.axisgrid.PairGrid at 0x264b9e68470>

# sns.pairplot(iris,hue='Species')

# "D:\docs\downloads\cars.csv"
#Laod the dataset
df6 = pd.read_csv('D:/docs/downloads/cars.csv')

# Viewing first 5 observations
df.head()

df6.plot.scatter(x='Weight', y='MPG.highway')

<Axes: xlabel='Weight', ylabel='MPG.highway'>

sns.regplot(x='Weight', y='MPG.highway', data=df6)

<Axes: xlabel='Weight', ylabel='MPG.highway'>

#changing color
sns.regplot(x='Weight', y='MPG.highway',data=df6, color='g')

<Axes: xlabel='Weight', ylabel='MPG.highway'>

sns.lmplot(x='Weight', y='MPG.highway',data=df6)

<seaborn.axisgrid.FacetGrid at 0x264c3fd2c60>

sns.lmplot(x='Weight', y='MPG.highway',hue='Origin', data=df6)

<seaborn.axisgrid.FacetGrid at 0x264c4fca390>

sns.lmplot(x='Weight', y='MPG.highway',hue='Origin',palette='Set1', data=df6)

<seaborn.axisgrid.FacetGrid at 0x264c3fc2ff0>

sns.lmplot(x='Weight', y='MPG.highway',hue='Origin',markers=['+','o'],palette='Set1', data=df6)

<seaborn.axisgrid.FacetGrid at 0x264c5154440>

# df7 = pd.read_csv('')

# df7.columns

Index(['A_id', 'Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness',
       'Ripeness', 'Acidity', 'Quality'],
      dtype='object')

# df7.head()

df7.dtypes

A_id           float64
Size           float64
Weight         float64
Sweetness      float64
Crunchiness    float64
Juiciness      float64
Ripeness       float64
Acidity         object
Quality         object
dtype: object

# sns.heatmap(df7.corr())

# sns.heatmap(df.corr(), annot =True)

# plt.figure(figsize=(10,5))
# sns.heatmap(df.corr(), annot =True)

	Row ID	Order ID	Order Date	Ship Date	Ship Mode	Customer ID	Customer Name	Segment	Country	City	...	Category	Sub-Category	Product Name	Sales	Quantity	Profit	Returns	Payment Mode	ind1	ind2
0	4918	CA-2019-160304	2019-01-01	2019-01-07	Standard Class	BM-11575	Brendan Murry	Corporate	United States	Gaithersburg	...	Furniture	Bookcases	Bush Westfield Collection Bookcases, Medium Ch...	73.94	1	28.2668	NaN	Online	NaN	NaN
1	4919	CA-2019-160304	2019-01-02	2019-01-07	Standard Class	BM-11575	Brendan Murry	Corporate	United States	Gaithersburg	...	Furniture	Bookcases	Bush Westfield Collection Bookcases, Medium Ch...	173.94	3	38.2668	NaN	Online	NaN	NaN
2	4920	CA-2019-160304	2019-01-02	2019-01-07	Standard Class	BM-11575	Brendan Murry	Corporate	United States	Gaithersburg	...	Technology	Phones	GE 30522EE2	231.98	2	67.2742	NaN	Cards	NaN	NaN
3	3074	CA-2019-125206	2019-01-03	2019-01-05	First Class	LR-16915	Lena Radford	Consumer	United States	Los Angeles	...	Office Supplies	Storage	Recycled Steel Personal File for Hanging File ...	114.46	2	28.6150	NaN	Online	NaN	NaN
4	8604	US-2019-116365	2019-01-03	2019-01-08	Standard Class	CA-12310	Christine Abelman	Corporate	United States	San Antonio	...	Technology	Accessories	Imation Clip USB flash drive - 8 GB	30.08	2	-5.2640	NaN	Online	NaN	NaN

	Order ID	Order Date	Ship Date	Ship Mode	Customer ID	Customer Name	Segment	Country	City	State	...	Category	Sub-Category	Product Name	Sales	Quantity	Profit	Returns	Payment Mode	ind1	ind2
0	CA-2019-160304	2019-01-01	2019-01-07	Standard Class	BM-11575	Brendan Murry	Corporate	United States	Gaithersburg	Maryland	...	Furniture	Bookcases	Bush Westfield Collection Bookcases, Medium Ch...	73.94	1	28.2668	NaN	Online	NaN	NaN
1	CA-2019-160304	2019-01-02	2019-01-07	Standard Class	BM-11575	Brendan Murry	Corporate	United States	Gaithersburg	Maryland	...	Furniture	Bookcases	Bush Westfield Collection Bookcases, Medium Ch...	173.94	3	38.2668	NaN	Online	NaN	NaN
2	CA-2019-160304	2019-01-02	2019-01-07	Standard Class	BM-11575	Brendan Murry	Corporate	United States	Gaithersburg	Maryland	...	Technology	Phones	GE 30522EE2	231.98	2	67.2742	NaN	Cards	NaN	NaN
3	CA-2019-125206	2019-01-03	2019-01-05	First Class	LR-16915	Lena Radford	Consumer	United States	Los Angeles	California	...	Office Supplies	Storage	Recycled Steel Personal File for Hanging File ...	114.46	2	28.6150	NaN	Online	NaN	NaN
4	US-2019-116365	2019-01-03	2019-01-08	Standard Class	CA-12310	Christine Abelman	Corporate	United States	San Antonio	Texas	...	Technology	Accessories	Imation Clip USB flash drive - 8 GB	30.08	2	-5.2640	NaN	Online	NaN	NaN

	Order ID	Ship Date	Ship Mode	Customer ID	Customer Name	Segment	Country	City	State	Region	...	Category	Sub-Category	Product Name	Sales	Quantity	Profit	Returns	Payment Mode	ind1	ind2
0	CA-2019-160304	2019-01-07	Standard Class	BM-11575	Brendan Murry	Corporate	United States	Gaithersburg	Maryland	East	...	Furniture	Bookcases	Bush Westfield Collection Bookcases, Medium Ch...	73.94	1	28.2668	NaN	Online	NaN	NaN
1	CA-2019-160304	2019-01-07	Standard Class	BM-11575	Brendan Murry	Corporate	United States	Gaithersburg	Maryland	East	...	Furniture	Bookcases	Bush Westfield Collection Bookcases, Medium Ch...	173.94	3	38.2668	NaN	Online	NaN	NaN
2	CA-2019-160304	2019-01-07	Standard Class	BM-11575	Brendan Murry	Corporate	United States	Gaithersburg	Maryland	East	...	Technology	Phones	GE 30522EE2	231.98	2	67.2742	NaN	Cards	NaN	NaN
3	CA-2019-125206	2019-01-05	First Class	LR-16915	Lena Radford	Consumer	United States	Los Angeles	California	West	...	Office Supplies	Storage	Recycled Steel Personal File for Hanging File ...	114.46	2	28.6150	NaN	Online	NaN	NaN
4	US-2019-116365	2019-01-08	Standard Class	CA-12310	Christine Abelman	Corporate	United States	San Antonio	Texas	Central	...	Technology	Accessories	Imation Clip USB flash drive - 8 GB	30.08	2	-5.2640	NaN	Online	NaN	NaN

	Order ID	Ship Date	Ship Mode	Customer ID	Customer Name	Segment	Country	City	State	Region	...	Category	Sub-Category	Product Name	Sales	Quantity	Profit	Returns	Payment Mode	ind1	ind2
0	CA-2019-160304	2019-01-07	Standard Class	BM-11575	Brendan Murry	Corporate	United States	Gaithersburg	Maryland	East	...	Furniture	Bookcases	Bush Westfield Collection Bookcases, Medium Ch...	73.94	1	28.2668	NaN	Online	NaN	NaN
2	CA-2019-160304	2019-01-07	Standard Class	BM-11575	Brendan Murry	Corporate	United States	Gaithersburg	Maryland	East	...	Technology	Phones	GE 30522EE2	231.98	2	67.2742	NaN	Cards	NaN	NaN
3	CA-2019-125206	2019-01-05	First Class	LR-16915	Lena Radford	Consumer	United States	Los Angeles	California	West	...	Office Supplies	Storage	Recycled Steel Personal File for Hanging File ...	114.46	2	28.6150	NaN	Online	NaN	NaN
4	US-2019-116365	2019-01-08	Standard Class	CA-12310	Christine Abelman	Corporate	United States	San Antonio	Texas	Central	...	Technology	Accessories	Imation Clip USB flash drive - 8 GB	30.08	2	-5.2640	NaN	Online	NaN	NaN
5	US-2019-116365	2019-01-08	Standard Class	CA-12310	Christine Abelman	Corporate	United States	San Antonio	Texas	Central	...	Technology	Accessories	WD My Passport Ultra 1TB Portable External Har...	165.60	3	-6.2100	NaN	Online	NaN	NaN

	Order ID	Ship Date	Ship Mode	Customer ID	Customer Name	Segment	Country	City	State	Region	...	Category	Sub-Category	Product Name	Sales	Quantity	Profit	Returns	Payment Mode	ind1	ind2
0	CA-2019-160304	2019-01-07	Standard Class	BM-11575	Brendan Murry	Corporate	United States	Gaithersburg	Maryland	East	...	Furniture	Bookcases	Bush Westfield Collection Bookcases, Medium Ch...	73.94	1	28.2668	NaN	Online	NaN	NaN
4	US-2019-116365	2019-01-08	Standard Class	CA-12310	Christine Abelman	Corporate	United States	San Antonio	Texas	Central	...	Technology	Accessories	Imation Clip USB flash drive - 8 GB	30.08	2	-5.2640	NaN	Online	NaN	NaN
5	US-2019-116365	2019-01-08	Standard Class	CA-12310	Christine Abelman	Corporate	United States	San Antonio	Texas	Central	...	Technology	Accessories	WD My Passport Ultra 1TB Portable External Har...	165.60	3	-6.2100	NaN	Online	NaN	NaN
6	US-2019-116365	2019-01-08	Standard Class	CA-12310	Christine Abelman	Corporate	United States	San Antonio	Texas	Central	...	Technology	Phones	AT&T 17929 Lendline Telephone	180.96	5	13.5720	NaN	Cards	NaN	NaN
7	CA-2019-105207	2019-01-08	Standard Class	BO-11350	Bill Overfelt	Corporate	United States	Broken Arrow	Oklahoma	Central	...	Furniture	Tables	Hon Practical Foundations 30 x 60 Training Tab...	1592.85	7	350.4270	NaN	COD	NaN	NaN

	city	rank	score1	score2
0	Delhi	3rd	50	53
1	Bangalore	1st	74	80
2	Hyderabad	1st	59	90
3	Pune	2nd	67	51
4	Bangalore	5th	63	73
5	Pune	3rd	48	100
6	Kolkata	3rd	32	89
7	Faridabad	4th	48	65
8	Hyderabad	5th	54	72
9	Hyderabad	4th	80	84
10	Hyderabad	4th	72	98
11	Hyderabad	5th	75	67
12	Pune	1st	61	54
13	Faridabad	5th	58	100
14	Kolkata	1st	70	51
15	Bangalore	2nd	35	87
16	Hyderabad	1st	40	100
17	Pune	1st	51	67
18	Delhi	1st	47	86
19	Kolkata	3rd	77	94
20	Jaipur	3rd	76	57
21	Kolkata	2nd	37	54
22	Mumbai	3rd	45	59
23	Jaipur	2nd	41	66
24	Noida	2nd	30	70
25	Bangalore	4th	70	82
26	Bangalore	4th	60	76
27	Chennai	5th	41	92
28	Jaipur	1st	61	65
29	Jaipur	4th	73	76
30	Jaipur	5th	41	66
31	Mumbai	1st	32	97
32	Jaipur	3rd	66	65
33	Kolkata	1st	71	63
34	Kolkata	5th	70	65
35	Mumbai	1st	40	54
36	Delhi	4th	57	83
37	Pune	2nd	66	94
38	Chennai	2nd	43	96
39	Hyderabad	3rd	65	82
40	Mumbai	4th	61	76
41	Bangalore	2nd	73	84
42	Pune	4th	65	56
43	Jaipur	5th	39	69
44	Jaipur	1st	52	88
45	Bangalore	3rd	72	53
46	Faridabad	2nd	67	62
47	Hyderabad	1st	40	55
48	Chennai	5th	70	51
49	Kolkata	3rd	44	59
50	Mumbai	1st	80	74
51	Faridabad	3rd	74	62
52	Delhi	2nd	79	85
53	Mumbai	2nd	58	51
54	Chennai	1st	62	88
55	Faridabad	3rd	32	96
56	Faridabad	3rd	36	76
57	Noida	3rd	52	55
58	Faridabad	5th	43	52
59	Kolkata	5th	38	76

	Row ID	Ship Date	Postal Code	Sales	Quantity	Discount	Profit
count	9994.000000	9994	9983.000000	9994.000000	9994.000000	9994.000000	9994.000000
mean	4997.500000	2021-05-04 04:17:20.304182528	55245.233297	229.858001	3.789574	0.156203	28.656896
min	1.000000	2019-01-07 00:00:00	1040.000000	0.444000	1.000000	0.000000	-6599.978000
25%	2499.250000	2020-05-27 00:00:00	23223.000000	17.280000	2.000000	0.000000	1.728750
50%	4997.500000	2021-06-29 00:00:00	57103.000000	54.490000	3.000000	0.200000	8.666500
75%	7495.750000	2022-05-18 00:00:00	90008.000000	209.940000	5.000000	0.200000	29.364000
max	9994.000000	2023-01-05 00:00:00	99301.000000	22638.480000	14.000000	0.800000	8399.976000
std	2885.163629	NaN	32038.715955	623.245101	2.225110	0.206452	234.260108

	name	height
0	Mohan	1.82
1	Maria	1.78
2	Deepak	1.77
3	Kunal	1.77
4	Piyush	1.60
5	Avinash	1.89
6	Lisa	1.82
7	Smita	1.89
8	Tanu	1.89
9	Khusboo	1.45
10	Nishant	1.95
11	Johnson	1.42
12	Donald	2.01
13	Rakesh	2.01
14	Pritvi	1.44
15	Roy	1.92
16	Ashish	1.71
17	Abhishek	1.84
18	Jassi	1.65
19	Nikita	1.61
20	Aman	1.78
21	Divya	1.52
22	Sneha	1.88
23	Ritu	1.78
24	Anuj	1.73
25	Chetan	1.50
26	Saurabh	1.57
27	Rohit	2.09
28	Gaurav	1.43
29	Meena	1.92
30	Seema	1.50
31	Priya	1.85
32	Alok	1.89
33	Sanya	1.59
34	Nilesh	1.93
35	Vikas	1.67
36	Ankita	1.76
37	Suraj	2.10
38	Bhavna	1.48
39	Kiran	2.09
40	Simran	1.52
41	Radhika	1.42
42	Vivek	1.52
43	Pranav	1.92
44	Hemant	1.54
45	Aditi	1.83
46	Komal	1.99
47	Tarun	1.48
48	Neha	1.73
49	Ishaan	1.92

	height
count	50.00000
mean	1.74340
std	0.20022
min	1.42000
25%	1.54750
50%	1.77500
75%	1.89000
max	2.10000

	height
count	50.000000
mean	8.324000
std	7.869812
min	4.620000
25%	5.370000
50%	5.830000
75%	6.480000
max	37.800000

	name	height
17	Abhishek	34.14
31	Priya	23.60
34	Nilesh	37.80
42	Vivek	26.07
45	Aditi	33.73

	name	height
0	Mohan	6.83
1	Maria	4.77
2	Deepak	6.01
3	Kunal	6.28
4	Piyush	5.76
5	Avinash	6.84
6	Lisa	5.90
7	Smita	6.96
8	Tanu	6.82
9	Khusboo	6.36
10	Nishant	6.96
11	Johnson	5.10
12	Donald	5.37
13	Rakesh	5.90
14	Pritvi	4.83
15	Roy	6.35
16	Ashish	5.82
18	Jassi	5.27
19	Nikita	5.72
20	Aman	5.82
21	Divya	4.62
22	Sneha	5.70
23	Ritu	4.80
24	Anuj	6.32
25	Chetan	5.37
26	Saurabh	5.94
27	Rohit	5.65
28	Gaurav	5.45
29	Meena	4.78
30	Seema	5.90
32	Alok	5.19
33	Sanya	5.73
35	Vikas	5.69
36	Ankita	5.39
37	Suraj	6.52
38	Bhavna	5.84
39	Kiran	5.36
40	Simran	6.33
41	Radhika	5.67
43	Pranav	5.08
44	Hemant	5.09
46	Komal	6.08
47	Tarun	5.32
48	Neha	6.84
49	Ishaan	6.53

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age
212	7	179	95	31	0	34.2	0.164	60
499	6	154	74	32	193	29.3	0.839	39
488	4	99	72	17	0	25.6	0.294	28
268	0	102	52	0	0	25.1	0.078	21
454	2	100	54	28	105	37.8	0.498	24

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
9	8	125	96	0	0	0.0	0.232	54	1
49	7	105	0	0	0	0.0	0.305	24	0
60	2	84	0	0	0	0.0	0.304	21	0
81	2	74	0	0	0	0.0	0.102	22	0
145	0	102	75	23	0	0.0	0.572	21	0
177	0	129	110	46	130	67.1	0.319	26	1
371	0	118	64	23	89	0.0	1.731	21	0
426	0	94	0	0	0	0.0	0.256	25	0
445	0	180	78	63	14	59.4	2.420	25	1
494	3	80	0	0	0	0.0	0.174	22	0
522	6	114	0	0	0	0.0	0.189	26	0
673	3	123	100	35	240	57.3	0.880	22	0
684	5	136	82	0	0	0.0	0.640	69	0
706	10	115	0	0	0	0.0	0.261	30	1

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome	zscore
0	6	148	72	35	0	33.6	0.627	50	1	0.203880
1	1	85	66	29	0	26.6	0.351	31	0	-0.683976
2	8	183	64	0	0	23.3	0.672	32	1	-1.102537
3	1	89	66	23	94	28.1	0.167	21	0	-0.493721
4	0	137	40	35	168	43.1	2.288	33	1	1.408828

	PassengerId	Survived	Pclass	Lname	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund	Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings	Mrs. John Bradley (Florence Briggs Thayer)	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen	Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle	Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen	Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

	Unnamed: 0	carat	cut	color	clarity	depth	table	price	x	y	z
0	1	0.23	Ideal	E	SI2	61.5	55.0	326	3.95	3.98	2.43
1	2	0.21	Premium	E	SI1	59.8	61.0	326	3.89	3.84	2.31
2	3	0.23	Good	E	VS1	56.9	65.0	327	4.05	4.07	2.31
3	4	0.29	Premium	I	VS2	62.4	58.0	334	4.20	4.23	2.63
4	5	0.31	Good	J	SI2	63.3	58.0	335	4.34	4.35	2.75
5	6	0.24	Very Good	J	VVS2	62.8	57.0	336	3.94	3.96	2.48
6	7	0.24	Very Good	I	VVS1	62.3	57.0	336	3.95	3.98	2.47
7	8	0.26	Very Good	H	SI1	61.9	55.0	337	4.07	4.11	2.53
8	9	0.22	Fair	E	VS2	65.1	61.0	337	3.87	3.78	2.49
9	10	0.23	Very Good	H	VS1	59.4	61.0	338	4.00	4.05	2.39
10	11	0.30	Good	J	SI1	64.0	55.0	339	4.25	4.28	2.73
11	12	0.23	Ideal	J	VS1	62.8	56.0	340	3.93	3.90	2.46
12	13	0.22	Premium	F	SI1	60.4	61.0	342	3.88	3.84	2.33
13	14	0.31	Ideal	J	SI2	62.2	54.0	344	4.35	4.37	2.71
14	15	0.20	Premium	E	SI2	60.2	62.0	345	3.79	3.75	2.27
15	16	0.32	Premium	E	I1	60.9	58.0	345	4.38	4.42	2.68
16	17	0.30	Ideal	I	SI2	62.0	54.0	348	4.31	4.34	2.68
17	18	0.30	Good	J	SI1	63.4	54.0	351	4.23	4.29	2.70
18	19	0.30	Good	J	SI1	63.8	56.0	351	4.23	4.26	2.71
19	20	0.30	Very Good	J	SI1	62.7	59.0	351	4.21	4.27	2.66

Ticker	AMZN	GOOG
Date
2022-01-03	170.404495	NaN
2022-01-04	167.522003	NaN
2022-01-05	164.356995	NaN
2022-01-06	163.253998	NaN
2022-01-07	162.554001	NaN

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome	zscore
9	8	125	96	0	0	0.0	0.232	54	1	-4.057829
49	7	105	0	0	0	0.0	0.305	24	0	-4.057829
60	2	84	0	0	0	0.0	0.304	21	0	-4.057829
81	2	74	0	0	0	0.0	0.102	22	0	-4.057829
145	0	102	75	23	0	0.0	0.572	21	0	-4.057829
177	0	129	110	46	130	67.1	0.319	26	1	4.452906
371	0	118	64	23	89	0.0	1.731	21	0	-4.057829
426	0	94	0	0	0	0.0	0.256	25	0	-4.057829
445	0	180	78	63	14	59.4	2.420	25	1	3.476264
494	3	80	0	0	0	0.0	0.174	22	0	-4.057829
522	6	114	0	0	0	0.0	0.189	26	0	-4.057829
673	3	123	100	35	240	57.3	0.880	22	0	3.209907
684	5	136	82	0	0	0.0	0.640	69	0	-4.057829
706	10	115	0	0	0	0.0	0.261	30	1	-4.057829

	150	4	setosa	versicolor
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	id	Manufacturer	Model	Type	Min.Price	Price	Max.Price	MPG.city	MPG.highway	AirBags	...	Passengers	Length	Wheelbase	Width	Turn.circle	Rear.seat.room	Luggage.room	Weight	Origin	Make
0	1	Acura	Integra	Small	12.9	15.9	18.8	25	31	NaN	...	5	177	102	68	37	26.5	11.0	2705	non-USA	Acura Integra
1	2	Acura	Legend	Midsize	29.2	33.9	38.7	18	25	Driver & Passenger	...	5	195	115	71	38	30.0	15.0	3560	non-USA	Acura Legend
2	3	Audi	90	Compact	25.9	29.1	32.3	20	26	Driver only	...	5	180	102	67	37	28.0	14.0	3375	non-USA	Audi 90
3	4	Audi	100	Midsize	30.8	37.7	44.6	19	26	NaN	...	6	193	106	70	37	31.0	17.0	3405	non-USA	Audi 100
4	5	BMW	535i	Midsize	23.7	30.0	36.2	22	30	Driver only	...	4	186	109	69	39	27.0	13.0	3640	non-USA	BMW 535i

	A_id	Size	Weight	Sweetness	Crunchiness	Juiciness	Ripeness	Acidity	Quality
0	0.0	-3.970049	-2.512336	5.346330	-1.012009	1.844900	0.329840	-0.491590483	good
1	1.0	-1.195217	-2.839257	3.664059	1.588232	0.853286	0.867530	-0.722809367	good
2	2.0	-0.292024	-1.351282	-1.738429	-0.342616	2.838636	-0.038033	2.621636473	bad
3	3.0	-0.657196	-2.271627	1.324874	-0.097875	3.637970	-3.413761	0.790723217	good
4	4.0	1.364217	-1.296612	-0.384658	-0.553006	3.030874	-1.303849	0.501984036	good

1.2 Outliers detection using IQR¶

	150	4	setosa	versicolor
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2