In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
In [27]:
#reading file 
df1 = pd.read_excel('C:/Users/Acer/Downloads/Project/SuperStore Sales DataSet.xlsx')
In [31]:
df1.head()
Out[31]:
Row ID Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment Country City ... Category Sub-Category Product Name Sales Quantity Profit Returns Payment Mode ind1 ind2
0 4918 CA-2019-160304 2019-01-01 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg ... Furniture Bookcases Bush Westfield Collection Bookcases, Medium Ch... 73.94 1 28.2668 NaN Online NaN NaN
1 4919 CA-2019-160304 2019-01-02 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg ... Furniture Bookcases Bush Westfield Collection Bookcases, Medium Ch... 173.94 3 38.2668 NaN Online NaN NaN
2 4920 CA-2019-160304 2019-01-02 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg ... Technology Phones GE 30522EE2 231.98 2 67.2742 NaN Cards NaN NaN
3 3074 CA-2019-125206 2019-01-03 2019-01-05 First Class LR-16915 Lena Radford Consumer United States Los Angeles ... Office Supplies Storage Recycled Steel Personal File for Hanging File ... 114.46 2 28.6150 NaN Online NaN NaN
4 8604 US-2019-116365 2019-01-03 2019-01-08 Standard Class CA-12310 Christine Abelman Corporate United States San Antonio ... Technology Accessories Imation Clip USB flash drive - 8 GB 30.08 2 -5.2640 NaN Online NaN NaN

5 rows × 23 columns

In [33]:
#Dropping Multiple Rows
df1.drop('Row ID',axis=1,inplace=True)
In [35]:
df1.head()
Out[35]:
Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment Country City State ... Category Sub-Category Product Name Sales Quantity Profit Returns Payment Mode ind1 ind2
0 CA-2019-160304 2019-01-01 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg Maryland ... Furniture Bookcases Bush Westfield Collection Bookcases, Medium Ch... 73.94 1 28.2668 NaN Online NaN NaN
1 CA-2019-160304 2019-01-02 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg Maryland ... Furniture Bookcases Bush Westfield Collection Bookcases, Medium Ch... 173.94 3 38.2668 NaN Online NaN NaN
2 CA-2019-160304 2019-01-02 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg Maryland ... Technology Phones GE 30522EE2 231.98 2 67.2742 NaN Cards NaN NaN
3 CA-2019-125206 2019-01-03 2019-01-05 First Class LR-16915 Lena Radford Consumer United States Los Angeles California ... Office Supplies Storage Recycled Steel Personal File for Hanging File ... 114.46 2 28.6150 NaN Online NaN NaN
4 US-2019-116365 2019-01-03 2019-01-08 Standard Class CA-12310 Christine Abelman Corporate United States San Antonio Texas ... Technology Accessories Imation Clip USB flash drive - 8 GB 30.08 2 -5.2640 NaN Online NaN NaN

5 rows × 22 columns

In [41]:
# Dropping the multiple columns togethor
df1.drop(['Order Date'],axis=1,inplace=True)
In [43]:
df1.head()
Out[43]:
Order ID Ship Date Ship Mode Customer ID Customer Name Segment Country City State Region ... Category Sub-Category Product Name Sales Quantity Profit Returns Payment Mode ind1 ind2
0 CA-2019-160304 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg Maryland East ... Furniture Bookcases Bush Westfield Collection Bookcases, Medium Ch... 73.94 1 28.2668 NaN Online NaN NaN
1 CA-2019-160304 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg Maryland East ... Furniture Bookcases Bush Westfield Collection Bookcases, Medium Ch... 173.94 3 38.2668 NaN Online NaN NaN
2 CA-2019-160304 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg Maryland East ... Technology Phones GE 30522EE2 231.98 2 67.2742 NaN Cards NaN NaN
3 CA-2019-125206 2019-01-05 First Class LR-16915 Lena Radford Consumer United States Los Angeles California West ... Office Supplies Storage Recycled Steel Personal File for Hanging File ... 114.46 2 28.6150 NaN Online NaN NaN
4 US-2019-116365 2019-01-08 Standard Class CA-12310 Christine Abelman Corporate United States San Antonio Texas Central ... Technology Accessories Imation Clip USB flash drive - 8 GB 30.08 2 -5.2640 NaN Online NaN NaN

5 rows × 21 columns

In [45]:
# Drop Rows (axis=0 for rows)
df1.drop(1,axis=0,inplace=True)
df1.head()
Out[45]:
Order ID Ship Date Ship Mode Customer ID Customer Name Segment Country City State Region ... Category Sub-Category Product Name Sales Quantity Profit Returns Payment Mode ind1 ind2
0 CA-2019-160304 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg Maryland East ... Furniture Bookcases Bush Westfield Collection Bookcases, Medium Ch... 73.94 1 28.2668 NaN Online NaN NaN
2 CA-2019-160304 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg Maryland East ... Technology Phones GE 30522EE2 231.98 2 67.2742 NaN Cards NaN NaN
3 CA-2019-125206 2019-01-05 First Class LR-16915 Lena Radford Consumer United States Los Angeles California West ... Office Supplies Storage Recycled Steel Personal File for Hanging File ... 114.46 2 28.6150 NaN Online NaN NaN
4 US-2019-116365 2019-01-08 Standard Class CA-12310 Christine Abelman Corporate United States San Antonio Texas Central ... Technology Accessories Imation Clip USB flash drive - 8 GB 30.08 2 -5.2640 NaN Online NaN NaN
5 US-2019-116365 2019-01-08 Standard Class CA-12310 Christine Abelman Corporate United States San Antonio Texas Central ... Technology Accessories WD My Passport Ultra 1TB Portable External Har... 165.60 3 -6.2100 NaN Online NaN NaN

5 rows × 21 columns

In [47]:
# Dropping Multiple Rows
df1.drop([2,3],axis =0,inplace=True)
df1.head()
Out[47]:
Order ID Ship Date Ship Mode Customer ID Customer Name Segment Country City State Region ... Category Sub-Category Product Name Sales Quantity Profit Returns Payment Mode ind1 ind2
0 CA-2019-160304 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg Maryland East ... Furniture Bookcases Bush Westfield Collection Bookcases, Medium Ch... 73.94 1 28.2668 NaN Online NaN NaN
4 US-2019-116365 2019-01-08 Standard Class CA-12310 Christine Abelman Corporate United States San Antonio Texas Central ... Technology Accessories Imation Clip USB flash drive - 8 GB 30.08 2 -5.2640 NaN Online NaN NaN
5 US-2019-116365 2019-01-08 Standard Class CA-12310 Christine Abelman Corporate United States San Antonio Texas Central ... Technology Accessories WD My Passport Ultra 1TB Portable External Har... 165.60 3 -6.2100 NaN Online NaN NaN
6 US-2019-116365 2019-01-08 Standard Class CA-12310 Christine Abelman Corporate United States San Antonio Texas Central ... Technology Phones AT&T 17929 Lendline Telephone 180.96 5 13.5720 NaN Cards NaN NaN
7 CA-2019-105207 2019-01-08 Standard Class BO-11350 Bill Overfelt Corporate United States Broken Arrow Oklahoma Central ... Furniture Tables Hon Practical Foundations 30 x 60 Training Tab... 1592.85 7 350.4270 NaN COD NaN NaN

5 rows × 21 columns

In [49]:
import random

cities = ["Faridabad", "Delhi", "Noida", "Mumbai", "Bangalore", "Chennai", "Hyderabad", "Pune", "Jaipur", "Kolkata"]
ranks = ["1st", "2nd", "3rd", "4th", "5th"]

raw_data = {
    "city": [random.choice(cities) for _ in range(60)],
    "rank": [random.choice(ranks) for _ in range(60)],
    "score1": [random.randint(30, 80) for _ in range(60)],
    "score2": [random.randint(50, 100) for _ in range(60)]
}

print(raw_data)
df2=pd.DataFrame(raw_data,columns=["city","rank","score1","score2"])
{'city': ['Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Bangalore', 'Pune', 'Kolkata', 'Faridabad', 'Hyderabad', 'Hyderabad', 'Hyderabad', 'Hyderabad', 'Pune', 'Faridabad', 'Kolkata', 'Bangalore', 'Hyderabad', 'Pune', 'Delhi', 'Kolkata', 'Jaipur', 'Kolkata', 'Mumbai', 'Jaipur', 'Noida', 'Bangalore', 'Bangalore', 'Chennai', 'Jaipur', 'Jaipur', 'Jaipur', 'Mumbai', 'Jaipur', 'Kolkata', 'Kolkata', 'Mumbai', 'Delhi', 'Pune', 'Chennai', 'Hyderabad', 'Mumbai', 'Bangalore', 'Pune', 'Jaipur', 'Jaipur', 'Bangalore', 'Faridabad', 'Hyderabad', 'Chennai', 'Kolkata', 'Mumbai', 'Faridabad', 'Delhi', 'Mumbai', 'Chennai', 'Faridabad', 'Faridabad', 'Noida', 'Faridabad', 'Kolkata'], 'rank': ['3rd', '1st', '1st', '2nd', '5th', '3rd', '3rd', '4th', '5th', '4th', '4th', '5th', '1st', '5th', '1st', '2nd', '1st', '1st', '1st', '3rd', '3rd', '2nd', '3rd', '2nd', '2nd', '4th', '4th', '5th', '1st', '4th', '5th', '1st', '3rd', '1st', '5th', '1st', '4th', '2nd', '2nd', '3rd', '4th', '2nd', '4th', '5th', '1st', '3rd', '2nd', '1st', '5th', '3rd', '1st', '3rd', '2nd', '2nd', '1st', '3rd', '3rd', '3rd', '5th', '5th'], 'score1': [50, 74, 59, 67, 63, 48, 32, 48, 54, 80, 72, 75, 61, 58, 70, 35, 40, 51, 47, 77, 76, 37, 45, 41, 30, 70, 60, 41, 61, 73, 41, 32, 66, 71, 70, 40, 57, 66, 43, 65, 61, 73, 65, 39, 52, 72, 67, 40, 70, 44, 80, 74, 79, 58, 62, 32, 36, 52, 43, 38], 'score2': [53, 80, 90, 51, 73, 100, 89, 65, 72, 84, 98, 67, 54, 100, 51, 87, 100, 67, 86, 94, 57, 54, 59, 66, 70, 82, 76, 92, 65, 76, 66, 97, 65, 63, 65, 54, 83, 94, 96, 82, 76, 84, 56, 69, 88, 53, 62, 55, 51, 59, 74, 62, 85, 51, 88, 96, 76, 55, 52, 76]}
In [51]:
df2
Out[51]:
city rank score1 score2
0 Delhi 3rd 50 53
1 Bangalore 1st 74 80
2 Hyderabad 1st 59 90
3 Pune 2nd 67 51
4 Bangalore 5th 63 73
5 Pune 3rd 48 100
6 Kolkata 3rd 32 89
7 Faridabad 4th 48 65
8 Hyderabad 5th 54 72
9 Hyderabad 4th 80 84
10 Hyderabad 4th 72 98
11 Hyderabad 5th 75 67
12 Pune 1st 61 54
13 Faridabad 5th 58 100
14 Kolkata 1st 70 51
15 Bangalore 2nd 35 87
16 Hyderabad 1st 40 100
17 Pune 1st 51 67
18 Delhi 1st 47 86
19 Kolkata 3rd 77 94
20 Jaipur 3rd 76 57
21 Kolkata 2nd 37 54
22 Mumbai 3rd 45 59
23 Jaipur 2nd 41 66
24 Noida 2nd 30 70
25 Bangalore 4th 70 82
26 Bangalore 4th 60 76
27 Chennai 5th 41 92
28 Jaipur 1st 61 65
29 Jaipur 4th 73 76
30 Jaipur 5th 41 66
31 Mumbai 1st 32 97
32 Jaipur 3rd 66 65
33 Kolkata 1st 71 63
34 Kolkata 5th 70 65
35 Mumbai 1st 40 54
36 Delhi 4th 57 83
37 Pune 2nd 66 94
38 Chennai 2nd 43 96
39 Hyderabad 3rd 65 82
40 Mumbai 4th 61 76
41 Bangalore 2nd 73 84
42 Pune 4th 65 56
43 Jaipur 5th 39 69
44 Jaipur 1st 52 88
45 Bangalore 3rd 72 53
46 Faridabad 2nd 67 62
47 Hyderabad 1st 40 55
48 Chennai 5th 70 51
49 Kolkata 3rd 44 59
50 Mumbai 1st 80 74
51 Faridabad 3rd 74 62
52 Delhi 2nd 79 85
53 Mumbai 2nd 58 51
54 Chennai 1st 62 88
55 Faridabad 3rd 32 96
56 Faridabad 3rd 36 76
57 Noida 3rd 52 55
58 Faridabad 5th 43 52
59 Kolkata 5th 38 76
In [55]:
#check for duplicate data
df2.duplicated()
#So, thereisno duplicate row in a dataframe
Out[55]:
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
37    False
38    False
39    False
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
50    False
51    False
52    False
53    False
54    False
55    False
56    False
57    False
58    False
59    False
dtype: bool
In [59]:
#check for duplicate rows in city
df2.duplicated(['city'])
#the first occurrences of data also treated as False
Out[59]:
0     False
1     False
2     False
3     False
4      True
5      True
6     False
7     False
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20    False
21     True
22    False
23     True
24    False
25     True
26     True
27    False
28     True
29     True
30     True
31     True
32     True
33     True
34     True
35     True
36     True
37     True
38     True
39     True
40     True
41     True
42     True
43     True
44     True
45     True
46     True
47     True
48     True
49     True
50     True
51     True
52     True
53     True
54     True
55     True
56     True
57     True
58     True
59     True
dtype: bool
In [61]:
df2.duplicated(['rank'])
#the first occurrences of data also treated as False
Out[61]:
0     False
1     False
2      True
3     False
4     False
5      True
6      True
7     False
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32     True
33     True
34     True
35     True
36     True
37     True
38     True
39     True
40     True
41     True
42     True
43     True
44     True
45     True
46     True
47     True
48     True
49     True
50     True
51     True
52     True
53     True
54     True
55     True
56     True
57     True
58     True
59     True
dtype: bool
In [63]:
df2.duplicated(['rank'],keep='last')
# the last occurances is treated as True, when we classify keep as last
Out[63]:
0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32     True
33     True
34     True
35     True
36     True
37     True
38     True
39     True
40     True
41     True
42    False
43     True
44     True
45     True
46     True
47     True
48     True
49     True
50     True
51     True
52     True
53    False
54    False
55     True
56     True
57    False
58     True
59    False
dtype: bool
In [65]:
#checking duplicate values on the basis of combinations
df2.duplicated(['city','rank'])
Out[65]:
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10     True
11     True
12    False
13    False
14    False
15    False
16     True
17     True
18    False
19     True
20    False
21    False
22    False
23    False
24    False
25    False
26     True
27    False
28    False
29    False
30    False
31    False
32     True
33     True
34    False
35     True
36    False
37     True
38    False
39    False
40    False
41     True
42    False
43     True
44     True
45    False
46    False
47     True
48     True
49     True
50     True
51    False
52    False
53    False
54    False
55     True
56     True
57    False
58     True
59     True
dtype: bool
In [67]:
df2.drop_duplicates()
# No rows are dropped because there is no duplicate rows
Out[67]:
city rank score1 score2
0 Delhi 3rd 50 53
1 Bangalore 1st 74 80
2 Hyderabad 1st 59 90
3 Pune 2nd 67 51
4 Bangalore 5th 63 73
5 Pune 3rd 48 100
6 Kolkata 3rd 32 89
7 Faridabad 4th 48 65
8 Hyderabad 5th 54 72
9 Hyderabad 4th 80 84
10 Hyderabad 4th 72 98
11 Hyderabad 5th 75 67
12 Pune 1st 61 54
13 Faridabad 5th 58 100
14 Kolkata 1st 70 51
15 Bangalore 2nd 35 87
16 Hyderabad 1st 40 100
17 Pune 1st 51 67
18 Delhi 1st 47 86
19 Kolkata 3rd 77 94
20 Jaipur 3rd 76 57
21 Kolkata 2nd 37 54
22 Mumbai 3rd 45 59
23 Jaipur 2nd 41 66
24 Noida 2nd 30 70
25 Bangalore 4th 70 82
26 Bangalore 4th 60 76
27 Chennai 5th 41 92
28 Jaipur 1st 61 65
29 Jaipur 4th 73 76
30 Jaipur 5th 41 66
31 Mumbai 1st 32 97
32 Jaipur 3rd 66 65
33 Kolkata 1st 71 63
34 Kolkata 5th 70 65
35 Mumbai 1st 40 54
36 Delhi 4th 57 83
37 Pune 2nd 66 94
38 Chennai 2nd 43 96
39 Hyderabad 3rd 65 82
40 Mumbai 4th 61 76
41 Bangalore 2nd 73 84
42 Pune 4th 65 56
43 Jaipur 5th 39 69
44 Jaipur 1st 52 88
45 Bangalore 3rd 72 53
46 Faridabad 2nd 67 62
47 Hyderabad 1st 40 55
48 Chennai 5th 70 51
49 Kolkata 3rd 44 59
50 Mumbai 1st 80 74
51 Faridabad 3rd 74 62
52 Delhi 2nd 79 85
53 Mumbai 2nd 58 51
54 Chennai 1st 62 88
55 Faridabad 3rd 32 96
56 Faridabad 3rd 36 76
57 Noida 3rd 52 55
58 Faridabad 5th 43 52
59 Kolkata 5th 38 76
In [71]:
df2.drop_duplicates(['city'])
#all the duplicate cities are dropped
Out[71]:
city rank score1 score2
0 Delhi 3rd 50 53
1 Bangalore 1st 74 80
2 Hyderabad 1st 59 90
3 Pune 2nd 67 51
6 Kolkata 3rd 32 89
7 Faridabad 4th 48 65
20 Jaipur 3rd 76 57
22 Mumbai 3rd 45 59
24 Noida 2nd 30 70
27 Chennai 5th 41 92
In [75]:
df2.drop_duplicates(['city','rank'])
# all the duplicate rows are dropped on the basis of combination of city and␣rank column
Out[75]:
city rank score1 score2
0 Delhi 3rd 50 53
1 Bangalore 1st 74 80
2 Hyderabad 1st 59 90
3 Pune 2nd 67 51
4 Bangalore 5th 63 73
5 Pune 3rd 48 100
6 Kolkata 3rd 32 89
7 Faridabad 4th 48 65
8 Hyderabad 5th 54 72
9 Hyderabad 4th 80 84
12 Pune 1st 61 54
13 Faridabad 5th 58 100
14 Kolkata 1st 70 51
15 Bangalore 2nd 35 87
18 Delhi 1st 47 86
20 Jaipur 3rd 76 57
21 Kolkata 2nd 37 54
22 Mumbai 3rd 45 59
23 Jaipur 2nd 41 66
24 Noida 2nd 30 70
25 Bangalore 4th 70 82
27 Chennai 5th 41 92
28 Jaipur 1st 61 65
29 Jaipur 4th 73 76
30 Jaipur 5th 41 66
31 Mumbai 1st 32 97
34 Kolkata 5th 70 65
36 Delhi 4th 57 83
38 Chennai 2nd 43 96
39 Hyderabad 3rd 65 82
40 Mumbai 4th 61 76
42 Pune 4th 65 56
45 Bangalore 3rd 72 53
46 Faridabad 2nd 67 62
51 Faridabad 3rd 74 62
52 Delhi 2nd 79 85
53 Mumbai 2nd 58 51
54 Chennai 1st 62 88
57 Noida 3rd 52 55
In [77]:
df.describe()
Out[77]:
Row ID Ship Date Postal Code Sales Quantity Discount Profit
count 9994.000000 9994 9983.000000 9994.000000 9994.000000 9994.000000 9994.000000
mean 4997.500000 2021-05-04 04:17:20.304182528 55245.233297 229.858001 3.789574 0.156203 28.656896
min 1.000000 2019-01-07 00:00:00 1040.000000 0.444000 1.000000 0.000000 -6599.978000
25% 2499.250000 2020-05-27 00:00:00 23223.000000 17.280000 2.000000 0.000000 1.728750
50% 4997.500000 2021-06-29 00:00:00 57103.000000 54.490000 3.000000 0.200000 8.666500
75% 7495.750000 2022-05-18 00:00:00 90008.000000 209.940000 5.000000 0.200000 29.364000
max 9994.000000 2023-01-05 00:00:00 99301.000000 22638.480000 14.000000 0.800000 8399.976000
std 2885.163629 NaN 32038.715955 623.245101 2.225110 0.206452 234.260108

1.1 What is an outlier ?

• A data point which is significantly far away from other data points
• IQR (Inter Quartile Range)
• IQR = Q3 - Q1
• Lower Boundary = Q1 - (1.5 * IQR)
• Upper Boundary = Q3 + (1.5 * IQR)
In [89]:
df1.head()
Out[89]:
Order ID Ship Date Ship Mode Customer ID Customer Name Segment Country City State Region ... Category Sub-Category Product Name Sales Quantity Profit Returns Payment Mode ind1 ind2
0 CA-2019-160304 2019-01-07 Standard Class BM-11575 Brendan Murry Corporate United States Gaithersburg Maryland East ... Furniture Bookcases Bush Westfield Collection Bookcases, Medium Ch... 73.94 1 28.2668 NaN Online NaN NaN
4 US-2019-116365 2019-01-08 Standard Class CA-12310 Christine Abelman Corporate United States San Antonio Texas Central ... Technology Accessories Imation Clip USB flash drive - 8 GB 30.08 2 -5.2640 NaN Online NaN NaN
5 US-2019-116365 2019-01-08 Standard Class CA-12310 Christine Abelman Corporate United States San Antonio Texas Central ... Technology Accessories WD My Passport Ultra 1TB Portable External Har... 165.60 3 -6.2100 NaN Online NaN NaN
6 US-2019-116365 2019-01-08 Standard Class CA-12310 Christine Abelman Corporate United States San Antonio Texas Central ... Technology Phones AT&T 17929 Lendline Telephone 180.96 5 13.5720 NaN Cards NaN NaN
7 CA-2019-105207 2019-01-08 Standard Class BO-11350 Bill Overfelt Corporate United States Broken Arrow Oklahoma Central ... Furniture Tables Hon Practical Foundations 30 x 60 Training Tab... 1592.85 7 350.4270 NaN COD NaN NaN

5 rows × 21 columns

In [91]:
df1['Profit'].describe()
Out[91]:
count    5898.000000
mean       29.692769
std       259.654675
min     -6599.978000
25%         1.794375
50%         8.502250
75%        28.436400
max      8399.976000
Name: Profit, dtype: float64
In [93]:
import seaborn as sns
In [95]:
sns.boxplot(x=df1['Profit'])
Out[95]:
<Axes: xlabel='Profit'>
No description has been provided for this image
In [97]:
sns.boxplot(x=df1['Sales'])
Out[97]:
<Axes: xlabel='Sales'>
No description has been provided for this image
In [99]:
sns.boxplot(x=df1['Quantity'])
Out[99]:
<Axes: xlabel='Quantity'>
No description has been provided for this image
In [101]:
df1.shape
Out[101]:
(5898, 21)
In [105]:
# Let's calculate first quartile,third_quartile and inter_quartile range
first_quartile = df1['Profit'].quantile(.25)
third_quartile = df1['Profit'].quantile(.75)
IQR = third_quartile - first_quartile
In [107]:
new_boundary = third_quartile + 3 * IQR
In [115]:
new_boundary = third_quartile + 3 * IQR
# dropped the outliers data
new_data = df1.drop(df1[df1['Profit']>new_boundary].index, axis = 0,inplace=False)
# 12 rows are dropped
new_data.shape
Out[115]:
(5423, 21)
In [117]:
sns.boxplot(x=new_data['Profit'])
Out[117]:
<Axes: xlabel='Profit'>
No description has been provided for this image
In [125]:
# Let's calculate first quartile,third_quartile and inter_quartile range
first_quartile1 = df1['Quantity'].quantile(.25)
third_quartile1 = df1['Quantity'].quantile(.75)
IQR = third_quartile - first_quartile
In [127]:
new_boundary1 = third_quartile1 + 3 * IQR
In [129]:
# dropped the outliers data
new_data1 = df1.drop(df1[df1['Quantity']>new_boundary1].index, axis = 0,inplace=False)
# 12 rows are dropped
new_data1.shape
Out[129]:
(5898, 21)
In [135]:
sns.boxplot(x=new_data1['Quantity'])
Out[135]:
<Axes: xlabel='Quantity'>
No description has been provided for this image

1.2 Outliers detection using IQR¶

In [138]:
import random

# List of names (50 names)
names = [
    'Mohan', 'Maria', 'Deepak', 'Kunal', 'Piyush', 'Avinash', 'Lisa', 'Smita', 'Tanu', 'Khusboo',
    'Nishant', 'Johnson', 'Donald', 'Rakesh', 'Pritvi', 'Roy', 'Ashish', 'Abhishek', 'Jassi', 'Nikita',
    'Aman', 'Divya', 'Sneha', 'Ritu', 'Anuj', 'Chetan', 'Saurabh', 'Rohit', 'Gaurav', 'Meena',
    'Seema', 'Priya', 'Alok', 'Sanya', 'Nilesh', 'Vikas', 'Ankita', 'Suraj', 'Bhavna', 'Kiran',
    'Simran', 'Radhika', 'Vivek', 'Pranav', 'Hemant', 'Aditi', 'Komal', 'Tarun', 'Neha', 'Ishaan'
]

# Generate random realistic heights between 1.4m and 2.1m
heights = [round(random.uniform(1.4, 2.1), 2) for _ in range(50)]

# Create the DataFrame
raw = {
    'name': names,
    'height': heights
}

df3 = pd.DataFrame(raw)
In [140]:
df3
Out[140]:
name height
0 Mohan 1.82
1 Maria 1.78
2 Deepak 1.77
3 Kunal 1.77
4 Piyush 1.60
5 Avinash 1.89
6 Lisa 1.82
7 Smita 1.89
8 Tanu 1.89
9 Khusboo 1.45
10 Nishant 1.95
11 Johnson 1.42
12 Donald 2.01
13 Rakesh 2.01
14 Pritvi 1.44
15 Roy 1.92
16 Ashish 1.71
17 Abhishek 1.84
18 Jassi 1.65
19 Nikita 1.61
20 Aman 1.78
21 Divya 1.52
22 Sneha 1.88
23 Ritu 1.78
24 Anuj 1.73
25 Chetan 1.50
26 Saurabh 1.57
27 Rohit 2.09
28 Gaurav 1.43
29 Meena 1.92
30 Seema 1.50
31 Priya 1.85
32 Alok 1.89
33 Sanya 1.59
34 Nilesh 1.93
35 Vikas 1.67
36 Ankita 1.76
37 Suraj 2.10
38 Bhavna 1.48
39 Kiran 2.09
40 Simran 1.52
41 Radhika 1.42
42 Vivek 1.52
43 Pranav 1.92
44 Hemant 1.54
45 Aditi 1.83
46 Komal 1.99
47 Tarun 1.48
48 Neha 1.73
49 Ishaan 1.92
In [142]:
df3.describe()
Out[142]:
height
count 50.00000
mean 1.74340
std 0.20022
min 1.42000
25% 1.54750
50% 1.77500
75% 1.89000
max 2.10000
In [144]:
import pandas as pd
import random
import numpy as np

# Names (50 different names)
names = [
    'Mohan', 'Maria', 'Deepak', 'Kunal', 'Piyush', 'Avinash', 'Lisa', 'Smita', 'Tanu', 'Khusboo',
    'Nishant', 'Johnson', 'Donald', 'Rakesh', 'Pritvi', 'Roy', 'Ashish', 'Abhishek', 'Jassi', 'Nikita',
    'Aman', 'Divya', 'Sneha', 'Ritu', 'Anuj', 'Chetan', 'Saurabh', 'Rohit', 'Gaurav', 'Meena',
    'Seema', 'Priya', 'Alok', 'Sanya', 'Nilesh', 'Vikas', 'Ankita', 'Suraj', 'Bhavna', 'Kiran',
    'Simran', 'Radhika', 'Vivek', 'Pranav', 'Hemant', 'Aditi', 'Komal', 'Tarun', 'Neha', 'Ishaan'
]

# Generate 45 "normal" heights around 5.5-6.5 range
normal_heights = [round(random.uniform(4.5, 7.0), 2) for _ in range(45)]

# Add 5 "outlier" very large heights
outliers = [round(random.uniform(12.0, 40.0), 2) for _ in range(5)]

# Combine and shuffle
heights = normal_heights + outliers
random.shuffle(heights)

# Create DataFrame
raw = {
    'name': names,
    'height': heights
}

df4 = pd.DataFrame(raw)
In [146]:
df4.describe()
Out[146]:
height
count 50.000000
mean 8.324000
std 7.869812
min 4.620000
25% 5.370000
50% 5.830000
75% 6.480000
max 37.800000
In [148]:
# Calculating the Q1 ,Q3
Q1 = df4.height.quantile(.25)
Q3 = df4.height.quantile(.75)
Q1 , Q3
Out[148]:
(5.37, 6.4799999999999995)
In [150]:
# Calculating IQR
IQR = Q3 - Q1
IQR
Out[150]:
1.1099999999999994
In [152]:
# Calculating IQR
lower_limit = Q1 - 1.5 * (IQR)
upper_limit = Q3 + 1.5 * (IQR)
lower_limit,upper_limit
Out[152]:
(3.705000000000001, 8.145)
In [156]:
#outliers
df4[(df4.height<lower_limit)|(df4.height>upper_limit)]
Out[156]:
name height
17 Abhishek 34.14
31 Priya 23.60
34 Nilesh 37.80
42 Vivek 26.07
45 Aditi 33.73
In [158]:
# data with no outliers
df_no_outliers = df4[(df4.height>lower_limit) & (df4.height < upper_limit) ]
In [160]:
df_no_outliers
Out[160]:
name height
0 Mohan 6.83
1 Maria 4.77
2 Deepak 6.01
3 Kunal 6.28
4 Piyush 5.76
5 Avinash 6.84
6 Lisa 5.90
7 Smita 6.96
8 Tanu 6.82
9 Khusboo 6.36
10 Nishant 6.96
11 Johnson 5.10
12 Donald 5.37
13 Rakesh 5.90
14 Pritvi 4.83
15 Roy 6.35
16 Ashish 5.82
18 Jassi 5.27
19 Nikita 5.72
20 Aman 5.82
21 Divya 4.62
22 Sneha 5.70
23 Ritu 4.80
24 Anuj 6.32
25 Chetan 5.37
26 Saurabh 5.94
27 Rohit 5.65
28 Gaurav 5.45
29 Meena 4.78
30 Seema 5.90
32 Alok 5.19
33 Sanya 5.73
35 Vikas 5.69
36 Ankita 5.39
37 Suraj 6.52
38 Bhavna 5.84
39 Kiran 5.36
40 Simran 6.33
41 Radhika 5.67
43 Pranav 5.08
44 Hemant 5.09
46 Komal 6.08
47 Tarun 5.32
48 Neha 6.84
49 Ishaan 6.53
In [164]:
df5 =pd.read_csv("D:/docs/dataset/archive/diabetes.csv")
In [167]:
df5.sample(5)
Out[167]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
212 7 179 95 31 0 34.2 0.164 60 0
499 6 154 74 32 193 29.3 0.839 39 0
488 4 99 72 17 0 25.6 0.294 28 0
268 0 102 52 0 0 25.1 0.078 21 0
454 2 100 54 28 105 37.8 0.498 24 0
In [172]:
df5.shape
Out[172]:
(768, 9)
In [176]:
plt.hist(df5.BMI,bins=20,rwidth=0.8)
plt.xlabel('Body Mass Index (BMI)')
plt.ylabel('Count')
plt.show()
No description has been provided for this image

Refer to https://www.mathsisfun.com/data/standard-normal-distribution.html for more details

In [180]:
from scipy.stats import norm
import numpy as np
plt.hist(df5.BMI,bins=20,rwidth=0.8,density=True)
plt.xlabel('BMI')
plt.ylabel('Count')
rng= np.arange(df5.BMI.min(),df5.BMI.max(),0.1)
plt.plot(rng,norm.pdf(rng,df5.BMI.mean(),df5.BMI.std()))
Out[180]:
[<matplotlib.lines.Line2D at 0x264af9c21b0>]
No description has been provided for this image
In [182]:
df5.BMI.std()
Out[182]:
7.8841603203754405
In [184]:
df5.BMI.mean()
Out[184]:
31.992578124999998
In [186]:
upper_limit = df5.BMI.mean() + 3 * df5.BMI.std()
upper_limit
Out[186]:
55.645059086126324
In [188]:
#Calculating lower_limit
lower_limit = df5.BMI.mean() - 3 * df5.BMI.std()
lower_limit
Out[188]:
8.340097163873676
In [190]:
# checking the outliers
df5[(df5.BMI>upper_limit )| (df5.BMI <lower_limit)]
Out[190]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
9 8 125 96 0 0 0.0 0.232 54 1
49 7 105 0 0 0 0.0 0.305 24 0
60 2 84 0 0 0 0.0 0.304 21 0
81 2 74 0 0 0 0.0 0.102 22 0
145 0 102 75 23 0 0.0 0.572 21 0
177 0 129 110 46 130 67.1 0.319 26 1
371 0 118 64 23 89 0.0 1.731 21 0
426 0 94 0 0 0 0.0 0.256 25 0
445 0 180 78 63 14 59.4 2.420 25 1
494 3 80 0 0 0 0.0 0.174 22 0
522 6 114 0 0 0 0.0 0.189 26 0
673 3 123 100 35 240 57.3 0.880 22 0
684 5 136 82 0 0 0.0 0.640 69 0
706 10 115 0 0 0 0.0 0.261 30 1
In [196]:
df_no_outlier_std_dev = df[(df5.BMI<upper_limit ) & (df5.BMI>lower_limit)]
df_no_outlier_std_dev.shape
Out[196]:
(754, 9)
In [198]:
df5.shape[0] - df_no_outlier_std_dev.shape[0]
Out[198]:
14

1.3 Outlier detection and removal using Z score Z score indicates how many standard deviation away a datapoint is. For example in our case mean is 66.37 and the standard deviation is 3.84 If a value of data point is 77.91, then Z score for that is 3 because it is 3 standard deviation away (77.91 = 66.37 + 3 * 3.84) Score is refer to data point below 1.4 z = (x – ￿) / ￿

In [201]:
df5['zscore'] = (df5.BMI - df5.BMI.mean())/df5.BMI.std()
In [203]:
df5.head(5)
Out[203]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome zscore
0 6 148 72 35 0 33.6 0.627 50 1 0.203880
1 1 85 66 29 0 26.6 0.351 31 0 -0.683976
2 8 183 64 0 0 23.3 0.672 32 1 -1.102537
3 1 89 66 23 94 28.1 0.167 21 0 -0.493721
4 0 137 40 35 168 43.1 2.288 33 1 1.408828
In [205]:
#outliers
df5[(df5['zscore']>3) | (df5['zscore']<-3)]
Out[205]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome zscore
9 8 125 96 0 0 0.0 0.232 54 1 -4.057829
49 7 105 0 0 0 0.0 0.305 24 0 -4.057829
60 2 84 0 0 0 0.0 0.304 21 0 -4.057829
81 2 74 0 0 0 0.0 0.102 22 0 -4.057829
145 0 102 75 23 0 0.0 0.572 21 0 -4.057829
177 0 129 110 46 130 67.1 0.319 26 1 4.452906
371 0 118 64 23 89 0.0 1.731 21 0 -4.057829
426 0 94 0 0 0 0.0 0.256 25 0 -4.057829
445 0 180 78 63 14 59.4 2.420 25 1 3.476264
494 3 80 0 0 0 0.0 0.174 22 0 -4.057829
522 6 114 0 0 0 0.0 0.189 26 0 -4.057829
673 3 123 100 35 240 57.3 0.880 22 0 3.209907
684 5 136 82 0 0 0.0 0.640 69 0 -4.057829
706 10 115 0 0 0 0.0 0.261 30 1 -4.057829
In [207]:
#removing outliers as per Z score
df_no_outliers = df5[(df5.zscore <3) & (df5.zscore > -3)]
df_no_outliers.head()
Out[207]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome zscore
0 6 148 72 35 0 33.6 0.627 50 1 0.203880
1 1 85 66 29 0 26.6 0.351 31 0 -0.683976
2 8 183 64 0 0 23.3 0.672 32 1 -1.102537
3 1 89 66 23 94 28.1 0.167 21 0 -0.493721
4 0 137 40 35 168 43.1 2.288 33 1 1.408828
In [209]:
#checkig the number of ouliers
df5.shape[0] - df_no_outliers.shape[0]
Out[209]:
14
In [213]:
df5.head()
Out[213]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome zscore
0 6 148 72 35 0 33.6 0.627 50 1 0.203880
1 1 85 66 29 0 26.6 0.351 31 0 -0.683976
2 8 183 64 0 0 23.3 0.672 32 1 -1.102537
3 1 89 66 23 94 28.1 0.167 21 0 -0.493721
4 0 137 40 35 168 43.1 2.288 33 1 1.408828
In [217]:
df5['Pregnancies']
Out[217]:
0       6
1       1
2       8
3       1
4       0
       ..
763    10
764     2
765     5
766     1
767     1
Name: Pregnancies, Length: 768, dtype: int64
In [219]:
from collections import Counter
In [221]:
Counter(df5.Pregnancies)
Out[221]:
Counter({1: 135,
         0: 111,
         2: 103,
         3: 75,
         4: 68,
         5: 57,
         6: 50,
         7: 45,
         8: 38,
         9: 28,
         10: 24,
         11: 11,
         13: 10,
         12: 9,
         14: 2,
         15: 1,
         17: 1})
In [223]:
# to get the frequency
df.Pregnancies.value_counts()
Out[223]:
Pregnancies
1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: count, dtype: int64
In [227]:
#checking the type
type(df5.Pregnancies.value_counts())
Out[227]:
pandas.core.series.Series
In [231]:
g1 = df5['Pregnancies'].value_counts()
In [233]:
g1.plot()
Out[233]:
<Axes: xlabel='Pregnancies'>
No description has been provided for this image
In [235]:
#Visualizing through a bar chart
g1.plot(kind='bar')
Out[235]:
<Axes: xlabel='Pregnancies'>
No description has been provided for this image
In [237]:
#Visualizing through pie chart
g1.plot(kind='pie')
Out[237]:
<Axes: ylabel='count'>
No description has been provided for this image

Refer to https://medium.com/analytics-vidhya/intro-to-univariate-analysis-de75454b4719 for more details

In [246]:
tdf = pd.read_csv('D:/Pdf/college/titanic.csv')
In [248]:
tdf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  156 non-null    int64  
 1   Survived     156 non-null    int64  
 2   Pclass       156 non-null    int64  
 3   Lname        156 non-null    object 
 4   Name         156 non-null    object 
 5   Sex          156 non-null    object 
 6   Age          126 non-null    float64
 7   SibSp        156 non-null    int64  
 8   Parch        156 non-null    int64  
 9   Ticket       156 non-null    object 
 10  Fare         156 non-null    float64
 11  Cabin        31 non-null     object 
 12  Embarked     155 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 16.0+ KB
In [250]:
#checking top 5 recordsof the dataset
tdf.head()
Out[250]:
PassengerId Survived Pclass Lname Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings Mrs. John Bradley (Florence Briggs Thayer) female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [252]:
#Let's visualize the survivors count
sns.countplot(x=tdf.Survived)
plt.show()
No description has been provided for this image
In [254]:
# Lets check how many males and females were survived
sns.countplot(x=titanic_df.Survived,hue=titanic_df.Sex)
plt.show()
No description has been provided for this image
In [256]:
#Let's distinguish the data by passenger class
sns.countplot(x=titanic_df.Survived,hue=titanic_df.Pclass)
plt.show()
No description has been provided for this image

1.4.1 Numerical Data

In [259]:
tips_df = pd.read_csv('D:/docs/downloads/tips.csv')
tips_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
In [261]:
#checking top 5 recordsof the dataset
tips_df.head()
Out[261]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [263]:
#distribution of tips
sns.distplot(x=tips_df.tip, hist=True, kde=True, color='r')
Out[263]:
<Axes: ylabel='Density'>
No description has been provided for this image
In [265]:
# to find out the range
sns.boxplot(tips_df.tip, color = 'b')
Out[265]:
<Axes: ylabel='tip'>
No description has been provided for this image
In [267]:
fig, axes = plt.subplots(1,2, figsize =(12,6))
#distribution of tips
sns.distplot(x=tips_df.tip, hist=True, kde=True, color='r',ax=axes[0])
# to find out the range
sns.boxplot(tips_df.tip, color = 'b',ax=axes[1])
axes[0].set_title("Distribution of Tips",fontsize=15)
axes[1].set_title("Tips Range",fontsize=15)
Out[267]:
Text(0.5, 1.0, 'Tips Range')
No description has been provided for this image

1.5 Subplots

In [270]:
x = np.linspace(-1,1,101)
y1 = x
y2 = x**2
y3 = x**3
y4 = x**4
In [272]:
fig,ax = plt.subplots(2,1)
ax[0].plot(x,y1,color ='b')
ax[1].plot(x,y2, color='r')
Out[272]:
[<matplotlib.lines.Line2D at 0x264b43b4b30>]
No description has been provided for this image
In [274]:
fig,ax = plt.subplots(2,2)
ax[0,0].plot(x,y1, color ='b')
ax[0,1].plot(x,y2, color='r')
ax[1,0].plot(x,y3, color='y')
ax[1,1].plot(x,y4, color='b')
fig.show()
No description has been provided for this image
In [327]:
#Load the dataset
d = pd.read_csv('C:/Users/Acer/Downloads/diamonds.csv')
In [329]:
d.head(20)
Out[329]:
Unnamed: 0 carat cut color clarity depth table price x y z
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
5 6 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
6 7 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
7 8 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
8 9 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
9 10 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39
10 11 0.30 Good J SI1 64.0 55.0 339 4.25 4.28 2.73
11 12 0.23 Ideal J VS1 62.8 56.0 340 3.93 3.90 2.46
12 13 0.22 Premium F SI1 60.4 61.0 342 3.88 3.84 2.33
13 14 0.31 Ideal J SI2 62.2 54.0 344 4.35 4.37 2.71
14 15 0.20 Premium E SI2 60.2 62.0 345 3.79 3.75 2.27
15 16 0.32 Premium E I1 60.9 58.0 345 4.38 4.42 2.68
16 17 0.30 Ideal I SI2 62.0 54.0 348 4.31 4.34 2.68
17 18 0.30 Good J SI1 63.4 54.0 351 4.23 4.29 2.70
18 19 0.30 Good J SI1 63.8 56.0 351 4.23 4.26 2.71
19 20 0.30 Very Good J SI1 62.7 59.0 351 4.21 4.27 2.66
In [331]:
#checking the shape
d.shape
Out[331]:
(53940, 11)
In [335]:
#filter the value where clarity is in {'SI1','VI2'}
d1 =d[d.clarity.isin(['SI1','VS2'])]
In [337]:
#checkig the shape after filtering
d1.shape
Out[337]:
(25323, 11)

Countplots

In [339]:
sns.set_style('darkgrid')
In [341]:
sns.countplot(x='color',data=d1)
Out[341]:
<Axes: xlabel='color', ylabel='count'>
No description has been provided for this image
In [345]:
#checking the value counts of color
d1.color.value_counts()
Out[345]:
color
E    4896
F    4332
G    4323
H    3918
D    3780
I    2593
J    1481
Name: count, dtype: int64
In [347]:
#use y to change the orientation instead of x
sns.countplot(y='cut',data=d1)
Out[347]:
<Axes: xlabel='count', ylabel='cut'>
No description has been provided for this image
In [351]:
#checking the datatypes
d1.dtypes
Out[351]:
Unnamed: 0      int64
carat         float64
cut            object
color          object
clarity        object
depth         float64
table         float64
price           int64
x             float64
y             float64
z             float64
dtype: object
In [353]:
#Providing the order
color_order = ['J','I','H','G','F','E','D']
sns.countplot(x='color',data=d1,order=color_order);
30
Out[353]:
30
No description has been provided for this image

1.6 Order Ascending or Descending

In [356]:
#checking the value counts
d1.color.value_counts()
Out[356]:
color
E    4896
F    4332
G    4323
H    3918
D    3780
I    2593
J    1481
Name: count, dtype: int64
In [358]:
#checking the Index
d1.color.value_counts().index
Out[358]:
Index(['E', 'F', 'G', 'H', 'D', 'I', 'J'], dtype='object', name='color')
In [362]:
#plotting as per index (plotting in descending order)
sns.countplot(x='color', data=d1, order=d1.color.value_counts().index);
No description has been provided for this image
In [364]:
#plotting in ascending order
sns.countplot(x='color', data=d1, order=d1.color.value_counts().index[::-1]);
No description has been provided for this image
In [366]:
# setting the color to Black
sns.countplot(x='color', data=d1, order=d1.color.value_counts().index[::-1],color='lightblue');
No description has been provided for this image
In [368]:
# setting the color to Black
sns.countplot(x='color', data=d1, order=d1.color.value_counts().index[::-1],palette='twilight');
No description has been provided for this image
In [370]:
#line width and edge color
sns.countplot(x='color',data=d1, lw=4, ec='black')
Out[370]:
<Axes: xlabel='color', ylabel='count'>
No description has been provided for this image
In [372]:
#hatching
sns.countplot(x='color',data=d1, lw=4, ec='black',hatch='/')
Out[372]:
<Axes: xlabel='color', ylabel='count'>
No description has been provided for this image

1.7 Scatter plot

In [375]:
y1 = np.random.randn(70)
y2 = np.random.randn(70)
# Both y1 and y2 should be of same size
plt.scatter(y1,y2)
plt.show()
No description has been provided for this image
In [377]:
## adjust axis, if required
y1 = np.random.randn(70)
y2 = np.random.randn(70)
plt.scatter(y1,y2)
plt.axis([-3,3,-3,3]) #plt.axis([Xmin,Xmax,Ymin,Ymax])
plt.show()
No description has been provided for this image
In [379]:
# using the scatter plot with comman axis
x = np.arange(70)
y1 = np.random.randn(70)
y2 = np.random.randn(70)
plt.scatter(x,y1, marker='o', label ='y1 with x')
plt.scatter(x,y2, marker='v', label ='y2 with x')
plt.legend(loc='upper left')
Out[379]:
<matplotlib.legend.Legend at 0x264b5293ec0>
No description has been provided for this image
In [381]:
#assigning the colors
x = np.arange(70)
y1 = np.random.randn(70)
y2 = np.random.randn(70)
plt.scatter(x,y1, marker='o', label ='y1 with x',color='g')
plt.scatter(x,y2, marker='v', label ='y2 with x',color='b')
plt.legend(loc='upper left')
Out[381]:
<matplotlib.legend.Legend at 0x264b7b6c320>
No description has been provided for this image

1.8 Line Plot

In [408]:
!pip install yfinance
Collecting yfinance
  Downloading yfinance-0.2.56-py2.py3-none-any.whl.metadata (5.8 kB)
Requirement already satisfied: pandas>=1.3.0 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (2.2.2)
Requirement already satisfied: numpy>=1.16.5 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (1.26.4)
Requirement already satisfied: requests>=2.31 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (2.32.2)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Requirement already satisfied: platformdirs>=2.0.0 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (3.10.0)
Requirement already satisfied: pytz>=2022.5 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (2024.1)
Requirement already satisfied: frozendict>=2.3.4 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (2.4.2)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.17.9.tar.gz (3.0 MB)
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
      --------------------------------------- 0.0/3.0 MB ? eta -:--:--
      --------------------------------------- 0.0/3.0 MB ? eta -:--:--
     - -------------------------------------- 0.1/3.0 MB 1.8 MB/s eta 0:00:02
     -- ------------------------------------- 0.2/3.0 MB 1.2 MB/s eta 0:00:03
     -- ------------------------------------- 0.2/3.0 MB 1.1 MB/s eta 0:00:03
     -- ------------------------------------- 0.2/3.0 MB 1.1 MB/s eta 0:00:03
     -- ------------------------------------- 0.2/3.0 MB 1.1 MB/s eta 0:00:03
     ---- ----------------------------------- 0.3/3.0 MB 996.0 kB/s eta 0:00:03
     ----- ---------------------------------- 0.4/3.0 MB 983.0 kB/s eta 0:00:03
     ----- ---------------------------------- 0.4/3.0 MB 995.2 kB/s eta 0:00:03
     ------ --------------------------------- 0.5/3.0 MB 1.1 MB/s eta 0:00:03
     ------- -------------------------------- 0.6/3.0 MB 1.1 MB/s eta 0:00:03
     -------- ------------------------------- 0.6/3.0 MB 1.2 MB/s eta 0:00:03
     -------- ------------------------------- 0.6/3.0 MB 1.2 MB/s eta 0:00:03
     --------- ------------------------------ 0.7/3.0 MB 1.1 MB/s eta 0:00:03
     --------- ------------------------------ 0.7/3.0 MB 1.1 MB/s eta 0:00:03
     ----------- ---------------------------- 0.8/3.0 MB 1.1 MB/s eta 0:00:02
     ----------- ---------------------------- 0.9/3.0 MB 1.1 MB/s eta 0:00:03
     ------------ --------------------------- 0.9/3.0 MB 1.1 MB/s eta 0:00:02
     ------------ --------------------------- 0.9/3.0 MB 1.1 MB/s eta 0:00:02
     ------------- -------------------------- 1.0/3.0 MB 1.1 MB/s eta 0:00:02
     ------------- -------------------------- 1.0/3.0 MB 1.0 MB/s eta 0:00:02
     -------------- ------------------------- 1.1/3.0 MB 1.1 MB/s eta 0:00:02
     --------------- ------------------------ 1.2/3.0 MB 1.1 MB/s eta 0:00:02
     --------------- ------------------------ 1.2/3.0 MB 1.1 MB/s eta 0:00:02
     --------------- ------------------------ 1.2/3.0 MB 982.6 kB/s eta 0:00:02
     ---------------- ----------------------- 1.2/3.0 MB 1.0 MB/s eta 0:00:02
     ----------------- ---------------------- 1.3/3.0 MB 1.0 MB/s eta 0:00:02
     ----------------- ---------------------- 1.3/3.0 MB 1.0 MB/s eta 0:00:02
     ------------------ --------------------- 1.4/3.0 MB 1.0 MB/s eta 0:00:02
     ------------------ --------------------- 1.4/3.0 MB 1.0 MB/s eta 0:00:02
     ------------------- -------------------- 1.5/3.0 MB 1.0 MB/s eta 0:00:02
     -------------------- ------------------- 1.5/3.0 MB 1.0 MB/s eta 0:00:02
     -------------------- ------------------- 1.6/3.0 MB 1.0 MB/s eta 0:00:02
     -------------------- ------------------- 1.6/3.0 MB 1.0 MB/s eta 0:00:02
     --------------------- ------------------ 1.6/3.0 MB 992.2 kB/s eta 0:00:02
     --------------------- ------------------ 1.6/3.0 MB 992.2 kB/s eta 0:00:02
     ---------------------- ----------------- 1.7/3.0 MB 991.5 kB/s eta 0:00:02
     ----------------------- ---------------- 1.8/3.0 MB 1.0 MB/s eta 0:00:02
     ----------------------- ---------------- 1.8/3.0 MB 1.0 MB/s eta 0:00:02
     ----------------------- ---------------- 1.8/3.0 MB 1.0 MB/s eta 0:00:02
     ------------------------ --------------- 1.9/3.0 MB 966.5 kB/s eta 0:00:02
     ------------------------- -------------- 1.9/3.0 MB 977.7 kB/s eta 0:00:02
     ------------------------- -------------- 2.0/3.0 MB 965.1 kB/s eta 0:00:02
     -------------------------- ------------- 2.0/3.0 MB 980.5 kB/s eta 0:00:02
     --------------------------- ------------ 2.1/3.0 MB 975.5 kB/s eta 0:00:01
     --------------------------- ------------ 2.1/3.0 MB 968.5 kB/s eta 0:00:01
     ---------------------------- ----------- 2.1/3.0 MB 966.4 kB/s eta 0:00:01
     ---------------------------- ----------- 2.2/3.0 MB 971.3 kB/s eta 0:00:01
     ----------------------------- ---------- 2.2/3.0 MB 967.1 kB/s eta 0:00:01
     ----------------------------- ---------- 2.2/3.0 MB 967.1 kB/s eta 0:00:01
     ------------------------------ --------- 2.3/3.0 MB 957.2 kB/s eta 0:00:01
     ------------------------------ --------- 2.3/3.0 MB 945.0 kB/s eta 0:00:01
     ------------------------------- -------- 2.4/3.0 MB 968.4 kB/s eta 0:00:01
     ------------------------------- -------- 2.4/3.0 MB 968.4 kB/s eta 0:00:01
     -------------------------------- ------- 2.5/3.0 MB 952.9 kB/s eta 0:00:01
     -------------------------------- ------- 2.5/3.0 MB 945.5 kB/s eta 0:00:01
     --------------------------------- ------ 2.5/3.0 MB 953.8 kB/s eta 0:00:01
     --------------------------------- ------ 2.5/3.0 MB 953.8 kB/s eta 0:00:01
     ---------------------------------- ----- 2.6/3.0 MB 943.5 kB/s eta 0:00:01
     ----------------------------------- ---- 2.7/3.0 MB 953.3 kB/s eta 0:00:01
     ----------------------------------- ---- 2.7/3.0 MB 936.1 kB/s eta 0:00:01
     ------------------------------------ --- 2.7/3.0 MB 938.6 kB/s eta 0:00:01
     ------------------------------------ --- 2.7/3.0 MB 938.6 kB/s eta 0:00:01
     ------------------------------------- -- 2.8/3.0 MB 931.8 kB/s eta 0:00:01
     ------------------------------------- -- 2.9/3.0 MB 938.9 kB/s eta 0:00:01
     -------------------------------------- - 2.9/3.0 MB 941.2 kB/s eta 0:00:01
     -------------------------------------- - 2.9/3.0 MB 941.2 kB/s eta 0:00:01
     ---------------------------------------  3.0/3.0 MB 929.7 kB/s eta 0:00:01
     ---------------------------------------  3.0/3.0 MB 924.3 kB/s eta 0:00:01
     ---------------------------------------- 3.0/3.0 MB 923.4 kB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Requirement already satisfied: beautifulsoup4>=4.11.1 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (4.12.3)
Requirement already satisfied: soupsieve>1.2 in c:\users\acer\anaconda3\lib\site-packages (from beautifulsoup4>=4.11.1->yfinance) (2.5)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=1.3.0->yfinance) (2.9.0.post0)
Requirement already satisfied: tzdata>=2022.7 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=1.3.0->yfinance) (2023.3)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.31->yfinance) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.31->yfinance) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.31->yfinance) (2.2.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.31->yfinance) (2024.12.14)
Requirement already satisfied: six>=1.5 in c:\users\acer\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=1.3.0->yfinance) (1.16.0)
Downloading yfinance-0.2.56-py2.py3-none-any.whl (113 kB)
   ---------------------------------------- 0.0/113.7 kB ? eta -:--:--
   --- ------------------------------------ 10.2/113.7 kB ? eta -:--:--
   -------------- ------------------------ 41.0/113.7 kB 653.6 kB/s eta 0:00:01
   ------------------------------- ------- 92.2/113.7 kB 871.5 kB/s eta 0:00:01
   -------------------------------------- 113.7/113.7 kB 663.1 kB/s eta 0:00:00
Downloading multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Building wheels for collected packages: peewee
  Building wheel for peewee (pyproject.toml): started
  Building wheel for peewee (pyproject.toml): finished with status 'done'
  Created wheel for peewee: filename=peewee-3.17.9-py3-none-any.whl size=139127 sha256=d4e0fc6219e187c4aedf623d122849492ef79b7793b740305def523e358a9d50
  Stored in directory: c:\users\acer\appdata\local\pip\cache\wheels\43\ef\2d\2c51d496bf084945ffdf838b4cc8767b8ba1cc20eb41588831
Successfully built peewee
Installing collected packages: peewee, multitasking, yfinance
Successfully installed multitasking-0.0.11 peewee-3.17.9 yfinance-0.2.56
In [400]:
#pip install --upgrade pandas_datareader
Requirement already satisfied: pandas_datareader in c:\users\acer\anaconda3\lib\site-packages (0.10.0)
Requirement already satisfied: lxml in c:\users\acer\anaconda3\lib\site-packages (from pandas_datareader) (5.2.1)
Requirement already satisfied: pandas>=0.23 in c:\users\acer\anaconda3\lib\site-packages (from pandas_datareader) (2.2.2)
Requirement already satisfied: requests>=2.19.0 in c:\users\acer\anaconda3\lib\site-packages (from pandas_datareader) (2.32.2)
Requirement already satisfied: numpy>=1.26.0 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=0.23->pandas_datareader) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=0.23->pandas_datareader) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=0.23->pandas_datareader) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=0.23->pandas_datareader) (2023.3)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.19.0->pandas_datareader) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.19.0->pandas_datareader) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.19.0->pandas_datareader) (2.2.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.19.0->pandas_datareader) (2024.12.14)
Requirement already satisfied: six>=1.5 in c:\users\acer\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=0.23->pandas_datareader) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [410]:
# extract data from various Internet sources into a pandas DataFrame
#import pandas_datareader as pdr
import yfinance as yf
In [412]:
#extracting Data
stocks = ['GOOG', 'AMZN']
#data = pdr.get_data_yahoo(stocks, start = '2022-01-01')['Close']
data = yf.download(stocks, start='2022-01-01')['Close']
data.head()
YF.download() has changed argument auto_adjust default to True
[*********************100%***********************]  2 of 2 completed

1 Failed download:
['GOOG']: ConnectionError(ReadTimeoutError("HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out."))
Out[412]:
Ticker AMZN GOOG
Date
2022-01-03 170.404495 NaN
2022-01-04 167.522003 NaN
2022-01-05 164.356995 NaN
2022-01-06 163.253998 NaN
2022-01-07 162.554001 NaN
In [414]:
from matplotlib import rcParams
rcParams['figure.figsize'] = 10,6
plt.plot(data.AMZN)
plt.plot(data.GOOG)
plt.grid(True, color='k', linestyle=':')
plt.title("Amazon & Google Prices")
plt.xlabel("Date")
Out[414]:
Text(0.5, 0, 'Date')
No description has been provided for this image
In [416]:
from matplotlib import rcParams
rcParams['figure.figsize'] = 10,6
plt.plot(data.AMZN)
plt.plot(data.GOOG)
plt.grid(True, color='k', linestyle=':')
plt.title("Amazon & Google Prices")
plt.xlabel("Date")
plt.legend(['GOOG','AMZN'],loc =2)
Out[416]:
<matplotlib.legend.Legend at 0x264ba57f260>
No description has been provided for this image

1.8.1 Refer to below link for more details • https://matplotlib.org/3.1.1/gallery/style_sheets/style_sheets_reference.html • https://matplotlib.org/3.1.1/api/markers_api.html formore details

In [419]:
#loading the Dataset
tips = pd.read_csv('D:/docs/downloads/tips.csv')
In [421]:
tips.head()
Out[421]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4

1.9 Boxplot

In [426]:
sns.boxplot(tips['size'])
Out[426]:
<Axes: ylabel='size'>
No description has been provided for this image
In [428]:
sns.boxplot(tips['total_bill'])
Out[428]:
<Axes: ylabel='total_bill'>
No description has been provided for this image
In [430]:
sns.boxplot(x='sex', y ='total_bill', data=tips)
Out[430]:
<Axes: xlabel='sex', ylabel='total_bill'>
No description has been provided for this image
In [432]:
sns.boxplot(x='day', y ='total_bill', data=tips)
Out[432]:
<Axes: xlabel='day', ylabel='total_bill'>
No description has been provided for this image
In [434]:
sns.boxplot(x='day', y ='total_bill', data=tips,hue='sex')
Out[434]:
<Axes: xlabel='day', ylabel='total_bill'>
No description has been provided for this image
In [440]:
sns.boxplot(x='day', y ='total_bill', data=tips,hue='sex',palette ='husl')
Out[440]:
<Axes: xlabel='day', ylabel='total_bill'>
No description has been provided for this image
In [442]:
sns.boxplot(x='day', y ='total_bill', data=tips,hue='smoker',palette='coolwarm')
Out[442]:
<Axes: xlabel='day', ylabel='total_bill'>
No description has been provided for this image
In [444]:
sns.boxplot(x='day', y ='total_bill', data=tips,hue='time')
Out[444]:
<Axes: xlabel='day', ylabel='total_bill'>
No description has been provided for this image

1.10 Joint Distribution

In [449]:
iris = pd.read_csv('C:/Users/Acer/Downloads/iris.csv')
In [451]:
sns.jointplot(x='total_bill', y='tip', data=tips)
Out[451]:
<seaborn.axisgrid.JointGrid at 0x264bb1a0f80>
No description has been provided for this image
In [455]:
iris.columns
Out[455]:
Index(['150', '4', 'setosa', 'versicolor', 'virginica'], dtype='object')
In [457]:
# sns.jointplot(x='SepalLengthCm', y='SepalWidthCm', data=iris)
# sns.jointplot(x='total_bill', y='tip', data=tips, kind='reg',color='green')
In [461]:
# adding regression lines
sns.jointplot(x='total_bill', y='tip', data=tips, kind='reg')
Out[461]:
<seaborn.axisgrid.JointGrid at 0x264bdd90170>
No description has been provided for this image

1.11 Bar Plots

In [464]:
sns.barplot(x='day', y ='tip',data=tips)
Out[464]:
<Axes: xlabel='day', ylabel='tip'>
No description has been provided for this image
In [466]:
sns.barplot(x='day', y ='total_bill',data=tips)
Out[466]:
<Axes: xlabel='day', ylabel='total_bill'>
No description has been provided for this image
In [468]:
sns.barplot(x='day', y ='total_bill',data=tips,hue='sex')
Out[468]:
<Axes: xlabel='day', ylabel='total_bill'>
No description has been provided for this image
In [470]:
sns.barplot(x='day', y ='total_bill',data=tips,hue='sex', palette='winter_r')
Out[470]:
<Axes: xlabel='day', ylabel='total_bill'>
No description has been provided for this image
In [472]:
sns.barplot(x='day', y ='total_bill',data=tips,hue='smoker')
Out[472]:
<Axes: xlabel='day', ylabel='total_bill'>
No description has been provided for this image
In [474]:
sns.barplot(x='total_bill', y ='day' , data = tips, palette ='spring')
Out[474]:
<Axes: xlabel='total_bill', ylabel='day'>
No description has been provided for this image
In [478]:
sns.barplot(x='day', y ='tip' , data = tips, palette='spring',order=['Thur','Fri','Sat','Sun'])
Out[478]:
<Axes: xlabel='day', ylabel='tip'>
No description has been provided for this image
In [480]:
sns.barplot(x='smoker', y ='tip' , data = tips,hue='sex')
Out[480]:
<Axes: xlabel='smoker', ylabel='tip'>
No description has been provided for this image
In [482]:
# #Laod the dataset
# df = pd.read_csv('mtcars.csv')
In [486]:
iris = pd.read_csv('C:/Users/Acer/Downloads/iris.csv')
iris.head()
Out[486]:
150 4 setosa versicolor virginica
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
In [488]:
sns.pairplot(iris)
Out[488]:
<seaborn.axisgrid.PairGrid at 0x264bb16d760>
No description has been provided for this image
In [492]:
sns.pairplot(iris,hue='versicolor')
Out[492]:
<seaborn.axisgrid.PairGrid at 0x264b9e68470>
No description has been provided for this image
In [494]:
# sns.pairplot(iris,hue='Species')
In [498]:
# "D:\docs\downloads\cars.csv"
#Laod the dataset
df6 = pd.read_csv('D:/docs/downloads/cars.csv')
In [503]:
# Viewing first 5 observations
df.head()
Out[503]:
id Manufacturer Model Type Min.Price Price Max.Price MPG.city MPG.highway AirBags ... Passengers Length Wheelbase Width Turn.circle Rear.seat.room Luggage.room Weight Origin Make
0 1 Acura Integra Small 12.9 15.9 18.8 25 31 NaN ... 5 177 102 68 37 26.5 11.0 2705 non-USA Acura Integra
1 2 Acura Legend Midsize 29.2 33.9 38.7 18 25 Driver & Passenger ... 5 195 115 71 38 30.0 15.0 3560 non-USA Acura Legend
2 3 Audi 90 Compact 25.9 29.1 32.3 20 26 Driver only ... 5 180 102 67 37 28.0 14.0 3375 non-USA Audi 90
3 4 Audi 100 Midsize 30.8 37.7 44.6 19 26 NaN ... 6 193 106 70 37 31.0 17.0 3405 non-USA Audi 100
4 5 BMW 535i Midsize 23.7 30.0 36.2 22 30 Driver only ... 4 186 109 69 39 27.0 13.0 3640 non-USA BMW 535i

5 rows × 28 columns

In [511]:
df6.plot.scatter(x='Weight', y='MPG.highway')
Out[511]:
<Axes: xlabel='Weight', ylabel='MPG.highway'>
No description has been provided for this image
In [515]:
sns.regplot(x='Weight', y='MPG.highway', data=df6)
Out[515]:
<Axes: xlabel='Weight', ylabel='MPG.highway'>
No description has been provided for this image
In [517]:
#changing color
sns.regplot(x='Weight', y='MPG.highway',data=df6, color='g')
Out[517]:
<Axes: xlabel='Weight', ylabel='MPG.highway'>
No description has been provided for this image
In [519]:
sns.lmplot(x='Weight', y='MPG.highway',data=df6)
Out[519]:
<seaborn.axisgrid.FacetGrid at 0x264c3fd2c60>
No description has been provided for this image
In [525]:
sns.lmplot(x='Weight', y='MPG.highway',hue='Origin', data=df6)
Out[525]:
<seaborn.axisgrid.FacetGrid at 0x264c4fca390>
No description has been provided for this image
In [527]:
sns.lmplot(x='Weight', y='MPG.highway',hue='Origin',palette='Set1', data=df6)
Out[527]:
<seaborn.axisgrid.FacetGrid at 0x264c3fc2ff0>
No description has been provided for this image
In [529]:
sns.lmplot(x='Weight', y='MPG.highway',hue='Origin',markers=['+','o'],palette='Set1', data=df6)
Out[529]:
<seaborn.axisgrid.FacetGrid at 0x264c5154440>
No description has been provided for this image
In [533]:
# df7 = pd.read_csv('')
In [535]:
# df7.columns
Out[535]:
Index(['A_id', 'Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness',
       'Ripeness', 'Acidity', 'Quality'],
      dtype='object')
In [537]:
# df7.head()
Out[537]:
A_id Size Weight Sweetness Crunchiness Juiciness Ripeness Acidity Quality
0 0.0 -3.970049 -2.512336 5.346330 -1.012009 1.844900 0.329840 -0.491590483 good
1 1.0 -1.195217 -2.839257 3.664059 1.588232 0.853286 0.867530 -0.722809367 good
2 2.0 -0.292024 -1.351282 -1.738429 -0.342616 2.838636 -0.038033 2.621636473 bad
3 3.0 -0.657196 -2.271627 1.324874 -0.097875 3.637970 -3.413761 0.790723217 good
4 4.0 1.364217 -1.296612 -0.384658 -0.553006 3.030874 -1.303849 0.501984036 good
In [539]:
df7.dtypes
Out[539]:
A_id           float64
Size           float64
Weight         float64
Sweetness      float64
Crunchiness    float64
Juiciness      float64
Ripeness       float64
Acidity         object
Quality         object
dtype: object
In [543]:
# sns.heatmap(df7.corr())
In [547]:
# sns.heatmap(df.corr(), annot =True)
In [551]:
# plt.figure(figsize=(10,5))
# sns.heatmap(df.corr(), annot =True)

1.12 Refer to below Link for the datasets: https://drive.google.com/drive/folders/1QZxDigr3kJCTjtDBRvbIujSsiR9E3Nak?usp=sharing 1.13 Refer to below link for “How to choose the right chart” https://towardsdatascience.com/data-visualization-how-to-choose-the-right-chart-part-1- d4c550085ea7

In [ ]: