import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Reading the file
df1 = pd.read_excel('C:/Users/Acer/Downloads/Project/SuperStore Sales DataSet.xlsx')
df1.head()
| | Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Category | Sub-Category | Product Name | Sales | Quantity | Profit | Returns | Payment Mode | ind1 | ind2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4918 | CA-2019-160304 | 2019-01-01 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | ... | Furniture | Bookcases | Bush Westfield Collection Bookcases, Medium Ch... | 73.94 | 1 | 28.2668 | NaN | Online | NaN | NaN |
| 1 | 4919 | CA-2019-160304 | 2019-01-02 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | ... | Furniture | Bookcases | Bush Westfield Collection Bookcases, Medium Ch... | 173.94 | 3 | 38.2668 | NaN | Online | NaN | NaN |
| 2 | 4920 | CA-2019-160304 | 2019-01-02 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | ... | Technology | Phones | GE 30522EE2 | 231.98 | 2 | 67.2742 | NaN | Cards | NaN | NaN |
| 3 | 3074 | CA-2019-125206 | 2019-01-03 | 2019-01-05 | First Class | LR-16915 | Lena Radford | Consumer | United States | Los Angeles | ... | Office Supplies | Storage | Recycled Steel Personal File for Hanging File ... | 114.46 | 2 | 28.6150 | NaN | Online | NaN | NaN |
| 4 | 8604 | US-2019-116365 | 2019-01-03 | 2019-01-08 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | ... | Technology | Accessories | Imation Clip USB flash drive - 8 GB | 30.08 | 2 | -5.2640 | NaN | Online | NaN | NaN |
5 rows × 23 columns
# Dropping a single column (axis=1 for columns)
df1.drop('Row ID',axis=1,inplace=True)
df1.head()
| | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | State | ... | Category | Sub-Category | Product Name | Sales | Quantity | Profit | Returns | Payment Mode | ind1 | ind2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CA-2019-160304 | 2019-01-01 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | Maryland | ... | Furniture | Bookcases | Bush Westfield Collection Bookcases, Medium Ch... | 73.94 | 1 | 28.2668 | NaN | Online | NaN | NaN |
| 1 | CA-2019-160304 | 2019-01-02 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | Maryland | ... | Furniture | Bookcases | Bush Westfield Collection Bookcases, Medium Ch... | 173.94 | 3 | 38.2668 | NaN | Online | NaN | NaN |
| 2 | CA-2019-160304 | 2019-01-02 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | Maryland | ... | Technology | Phones | GE 30522EE2 | 231.98 | 2 | 67.2742 | NaN | Cards | NaN | NaN |
| 3 | CA-2019-125206 | 2019-01-03 | 2019-01-05 | First Class | LR-16915 | Lena Radford | Consumer | United States | Los Angeles | California | ... | Office Supplies | Storage | Recycled Steel Personal File for Hanging File ... | 114.46 | 2 | 28.6150 | NaN | Online | NaN | NaN |
| 4 | US-2019-116365 | 2019-01-03 | 2019-01-08 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | Texas | ... | Technology | Accessories | Imation Clip USB flash drive - 8 GB | 30.08 | 2 | -5.2640 | NaN | Online | NaN | NaN |
5 rows × 22 columns
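As an aside, pandas also accepts a columns= keyword for drop, which avoids remembering the axis convention. A tiny self-contained sketch (the demo frame is made up for illustration):
import pandas as pd
demo = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
demo = demo.drop(columns=['a'])   # equivalent to demo.drop('a', axis=1)
print(demo)                       # only column 'b' remains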
# Dropping columns by passing a list (here only 'Order Date'; add more names to drop several at once)
df1.drop(['Order Date'],axis=1,inplace=True)
df1.head()
| | Order ID | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | State | Region | ... | Category | Sub-Category | Product Name | Sales | Quantity | Profit | Returns | Payment Mode | ind1 | ind2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CA-2019-160304 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | Maryland | East | ... | Furniture | Bookcases | Bush Westfield Collection Bookcases, Medium Ch... | 73.94 | 1 | 28.2668 | NaN | Online | NaN | NaN |
| 1 | CA-2019-160304 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | Maryland | East | ... | Furniture | Bookcases | Bush Westfield Collection Bookcases, Medium Ch... | 173.94 | 3 | 38.2668 | NaN | Online | NaN | NaN |
| 2 | CA-2019-160304 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | Maryland | East | ... | Technology | Phones | GE 30522EE2 | 231.98 | 2 | 67.2742 | NaN | Cards | NaN | NaN |
| 3 | CA-2019-125206 | 2019-01-05 | First Class | LR-16915 | Lena Radford | Consumer | United States | Los Angeles | California | West | ... | Office Supplies | Storage | Recycled Steel Personal File for Hanging File ... | 114.46 | 2 | 28.6150 | NaN | Online | NaN | NaN |
| 4 | US-2019-116365 | 2019-01-08 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | Texas | Central | ... | Technology | Accessories | Imation Clip USB flash drive - 8 GB | 30.08 | 2 | -5.2640 | NaN | Online | NaN | NaN |
5 rows × 21 columns
# Drop Rows (axis=0 for rows)
df1.drop(1,axis=0,inplace=True)
df1.head()
| | Order ID | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | State | Region | ... | Category | Sub-Category | Product Name | Sales | Quantity | Profit | Returns | Payment Mode | ind1 | ind2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CA-2019-160304 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | Maryland | East | ... | Furniture | Bookcases | Bush Westfield Collection Bookcases, Medium Ch... | 73.94 | 1 | 28.2668 | NaN | Online | NaN | NaN |
| 2 | CA-2019-160304 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | Maryland | East | ... | Technology | Phones | GE 30522EE2 | 231.98 | 2 | 67.2742 | NaN | Cards | NaN | NaN |
| 3 | CA-2019-125206 | 2019-01-05 | First Class | LR-16915 | Lena Radford | Consumer | United States | Los Angeles | California | West | ... | Office Supplies | Storage | Recycled Steel Personal File for Hanging File ... | 114.46 | 2 | 28.6150 | NaN | Online | NaN | NaN |
| 4 | US-2019-116365 | 2019-01-08 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | Texas | Central | ... | Technology | Accessories | Imation Clip USB flash drive - 8 GB | 30.08 | 2 | -5.2640 | NaN | Online | NaN | NaN |
| 5 | US-2019-116365 | 2019-01-08 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | Texas | Central | ... | Technology | Accessories | WD My Passport Ultra 1TB Portable External Har... | 165.60 | 3 | -6.2100 | NaN | Online | NaN | NaN |
5 rows × 21 columns
# Dropping Multiple Rows
df1.drop([2,3],axis =0,inplace=True)
df1.head()
| | Order ID | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | State | Region | ... | Category | Sub-Category | Product Name | Sales | Quantity | Profit | Returns | Payment Mode | ind1 | ind2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CA-2019-160304 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | Maryland | East | ... | Furniture | Bookcases | Bush Westfield Collection Bookcases, Medium Ch... | 73.94 | 1 | 28.2668 | NaN | Online | NaN | NaN |
| 4 | US-2019-116365 | 2019-01-08 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | Texas | Central | ... | Technology | Accessories | Imation Clip USB flash drive - 8 GB | 30.08 | 2 | -5.2640 | NaN | Online | NaN | NaN |
| 5 | US-2019-116365 | 2019-01-08 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | Texas | Central | ... | Technology | Accessories | WD My Passport Ultra 1TB Portable External Har... | 165.60 | 3 | -6.2100 | NaN | Online | NaN | NaN |
| 6 | US-2019-116365 | 2019-01-08 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | Texas | Central | ... | Technology | Phones | AT&T 17929 Lendline Telephone | 180.96 | 5 | 13.5720 | NaN | Cards | NaN | NaN |
| 7 | CA-2019-105207 | 2019-01-08 | Standard Class | BO-11350 | Bill Overfelt | Corporate | United States | Broken Arrow | Oklahoma | Central | ... | Furniture | Tables | Hon Practical Foundations 30 x 60 Training Tab... | 1592.85 | 7 | 350.4270 | NaN | COD | NaN | NaN |
5 rows × 21 columns
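Dropping rows leaves gaps in the index (0, 4, 5, 6, 7 above). If a clean 0..n-1 index is preferred, reset_index renumbers it; here it is applied to a copy so df1 and the later outputs keep the original labels:
# drop=True discards the old index instead of keeping it as a new column
df1_renumbered = df1.reset_index(drop=True)
df1_renumbered.head()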
import random
cities = ["Faridabad", "Delhi", "Noida", "Mumbai", "Bangalore", "Chennai", "Hyderabad", "Pune", "Jaipur", "Kolkata"]
ranks = ["1st", "2nd", "3rd", "4th", "5th"]
raw_data = {
"city": [random.choice(cities) for _ in range(60)],
"rank": [random.choice(ranks) for _ in range(60)],
"score1": [random.randint(30, 80) for _ in range(60)],
"score2": [random.randint(50, 100) for _ in range(60)]
}
print(raw_data)
df2=pd.DataFrame(raw_data,columns=["city","rank","score1","score2"])
{'city': ['Delhi', 'Bangalore', 'Hyderabad', 'Pune', 'Bangalore', 'Pune', 'Kolkata', 'Faridabad', 'Hyderabad', 'Hyderabad', 'Hyderabad', 'Hyderabad', 'Pune', 'Faridabad', 'Kolkata', 'Bangalore', 'Hyderabad', 'Pune', 'Delhi', 'Kolkata', 'Jaipur', 'Kolkata', 'Mumbai', 'Jaipur', 'Noida', 'Bangalore', 'Bangalore', 'Chennai', 'Jaipur', 'Jaipur', 'Jaipur', 'Mumbai', 'Jaipur', 'Kolkata', 'Kolkata', 'Mumbai', 'Delhi', 'Pune', 'Chennai', 'Hyderabad', 'Mumbai', 'Bangalore', 'Pune', 'Jaipur', 'Jaipur', 'Bangalore', 'Faridabad', 'Hyderabad', 'Chennai', 'Kolkata', 'Mumbai', 'Faridabad', 'Delhi', 'Mumbai', 'Chennai', 'Faridabad', 'Faridabad', 'Noida', 'Faridabad', 'Kolkata'], 'rank': ['3rd', '1st', '1st', '2nd', '5th', '3rd', '3rd', '4th', '5th', '4th', '4th', '5th', '1st', '5th', '1st', '2nd', '1st', '1st', '1st', '3rd', '3rd', '2nd', '3rd', '2nd', '2nd', '4th', '4th', '5th', '1st', '4th', '5th', '1st', '3rd', '1st', '5th', '1st', '4th', '2nd', '2nd', '3rd', '4th', '2nd', '4th', '5th', '1st', '3rd', '2nd', '1st', '5th', '3rd', '1st', '3rd', '2nd', '2nd', '1st', '3rd', '3rd', '3rd', '5th', '5th'], 'score1': [50, 74, 59, 67, 63, 48, 32, 48, 54, 80, 72, 75, 61, 58, 70, 35, 40, 51, 47, 77, 76, 37, 45, 41, 30, 70, 60, 41, 61, 73, 41, 32, 66, 71, 70, 40, 57, 66, 43, 65, 61, 73, 65, 39, 52, 72, 67, 40, 70, 44, 80, 74, 79, 58, 62, 32, 36, 52, 43, 38], 'score2': [53, 80, 90, 51, 73, 100, 89, 65, 72, 84, 98, 67, 54, 100, 51, 87, 100, 67, 86, 94, 57, 54, 59, 66, 70, 82, 76, 92, 65, 76, 66, 97, 65, 63, 65, 54, 83, 94, 96, 82, 76, 84, 56, 69, 88, 53, 62, 55, 51, 59, 74, 62, 85, 51, 88, 96, 76, 55, 52, 76]}
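Since random.choice and random.randint are unseeded, these values change on every run. Seeding the generator before building raw_data makes the notebook reproducible (the seed value is arbitrary):
random.seed(42)   # call this before generating raw_data to get repeatable draws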
df2
| | city | rank | score1 | score2 |
|---|---|---|---|---|
| 0 | Delhi | 3rd | 50 | 53 |
| 1 | Bangalore | 1st | 74 | 80 |
| 2 | Hyderabad | 1st | 59 | 90 |
| 3 | Pune | 2nd | 67 | 51 |
| 4 | Bangalore | 5th | 63 | 73 |
| 5 | Pune | 3rd | 48 | 100 |
| 6 | Kolkata | 3rd | 32 | 89 |
| 7 | Faridabad | 4th | 48 | 65 |
| 8 | Hyderabad | 5th | 54 | 72 |
| 9 | Hyderabad | 4th | 80 | 84 |
| 10 | Hyderabad | 4th | 72 | 98 |
| 11 | Hyderabad | 5th | 75 | 67 |
| 12 | Pune | 1st | 61 | 54 |
| 13 | Faridabad | 5th | 58 | 100 |
| 14 | Kolkata | 1st | 70 | 51 |
| 15 | Bangalore | 2nd | 35 | 87 |
| 16 | Hyderabad | 1st | 40 | 100 |
| 17 | Pune | 1st | 51 | 67 |
| 18 | Delhi | 1st | 47 | 86 |
| 19 | Kolkata | 3rd | 77 | 94 |
| 20 | Jaipur | 3rd | 76 | 57 |
| 21 | Kolkata | 2nd | 37 | 54 |
| 22 | Mumbai | 3rd | 45 | 59 |
| 23 | Jaipur | 2nd | 41 | 66 |
| 24 | Noida | 2nd | 30 | 70 |
| 25 | Bangalore | 4th | 70 | 82 |
| 26 | Bangalore | 4th | 60 | 76 |
| 27 | Chennai | 5th | 41 | 92 |
| 28 | Jaipur | 1st | 61 | 65 |
| 29 | Jaipur | 4th | 73 | 76 |
| 30 | Jaipur | 5th | 41 | 66 |
| 31 | Mumbai | 1st | 32 | 97 |
| 32 | Jaipur | 3rd | 66 | 65 |
| 33 | Kolkata | 1st | 71 | 63 |
| 34 | Kolkata | 5th | 70 | 65 |
| 35 | Mumbai | 1st | 40 | 54 |
| 36 | Delhi | 4th | 57 | 83 |
| 37 | Pune | 2nd | 66 | 94 |
| 38 | Chennai | 2nd | 43 | 96 |
| 39 | Hyderabad | 3rd | 65 | 82 |
| 40 | Mumbai | 4th | 61 | 76 |
| 41 | Bangalore | 2nd | 73 | 84 |
| 42 | Pune | 4th | 65 | 56 |
| 43 | Jaipur | 5th | 39 | 69 |
| 44 | Jaipur | 1st | 52 | 88 |
| 45 | Bangalore | 3rd | 72 | 53 |
| 46 | Faridabad | 2nd | 67 | 62 |
| 47 | Hyderabad | 1st | 40 | 55 |
| 48 | Chennai | 5th | 70 | 51 |
| 49 | Kolkata | 3rd | 44 | 59 |
| 50 | Mumbai | 1st | 80 | 74 |
| 51 | Faridabad | 3rd | 74 | 62 |
| 52 | Delhi | 2nd | 79 | 85 |
| 53 | Mumbai | 2nd | 58 | 51 |
| 54 | Chennai | 1st | 62 | 88 |
| 55 | Faridabad | 3rd | 32 | 96 |
| 56 | Faridabad | 3rd | 36 | 76 |
| 57 | Noida | 3rd | 52 | 55 |
| 58 | Faridabad | 5th | 43 | 52 |
| 59 | Kolkata | 5th | 38 | 76 |
#check for duplicate data
df2.duplicated()
# So, there is no duplicate row in the DataFrame
0 False 1 False 2 False 3 False 4 False 5 False 6 False 7 False 8 False 9 False 10 False 11 False 12 False 13 False 14 False 15 False 16 False 17 False 18 False 19 False 20 False 21 False 22 False 23 False 24 False 25 False 26 False 27 False 28 False 29 False 30 False 31 False 32 False 33 False 34 False 35 False 36 False 37 False 38 False 39 False 40 False 41 False 42 False 43 False 44 False 45 False 46 False 47 False 48 False 49 False 50 False 51 False 52 False 53 False 54 False 55 False 56 False 57 False 58 False 59 False dtype: bool
#check for duplicate rows in city
df2.duplicated(['city'])
# the first occurrence of each value is marked False (only later repeats count as duplicates)
0 False 1 False 2 False 3 False 4 True 5 True 6 False 7 False 8 True 9 True 10 True 11 True 12 True 13 True 14 True 15 True 16 True 17 True 18 True 19 True 20 False 21 True 22 False 23 True 24 False 25 True 26 True 27 False 28 True 29 True 30 True 31 True 32 True 33 True 34 True 35 True 36 True 37 True 38 True 39 True 40 True 41 True 42 True 43 True 44 True 45 True 46 True 47 True 48 True 49 True 50 True 51 True 52 True 53 True 54 True 55 True 56 True 57 True 58 True 59 True dtype: bool
df2.duplicated(['rank'])
# the first occurrence of each value is marked False (only later repeats count as duplicates)
0 False 1 False 2 True 3 False 4 False 5 True 6 True 7 False 8 True 9 True 10 True 11 True 12 True 13 True 14 True 15 True 16 True 17 True 18 True 19 True 20 True 21 True 22 True 23 True 24 True 25 True 26 True 27 True 28 True 29 True 30 True 31 True 32 True 33 True 34 True 35 True 36 True 37 True 38 True 39 True 40 True 41 True 42 True 43 True 44 True 45 True 46 True 47 True 48 True 49 True 50 True 51 True 52 True 53 True 54 True 55 True 56 True 57 True 58 True 59 True dtype: bool
df2.duplicated(['rank'],keep='last')
# with keep='last', the last occurrence is marked False and all earlier duplicates True
0 True 1 True 2 True 3 True 4 True 5 True 6 True 7 True 8 True 9 True 10 True 11 True 12 True 13 True 14 True 15 True 16 True 17 True 18 True 19 True 20 True 21 True 22 True 23 True 24 True 25 True 26 True 27 True 28 True 29 True 30 True 31 True 32 True 33 True 34 True 35 True 36 True 37 True 38 True 39 True 40 True 41 True 42 False 43 True 44 True 45 True 46 True 47 True 48 True 49 True 50 True 51 True 52 True 53 False 54 False 55 True 56 True 57 False 58 True 59 False dtype: bool
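To make the keep behaviour concrete, here is a tiny self-contained sketch (the demo frame is made up for illustration):
import pandas as pd
demo = pd.DataFrame({'rank': ['1st', '2nd', '1st', '1st']})
print(demo.duplicated(['rank']))                # keep='first' (default): rows 2 and 3 -> True
print(demo.duplicated(['rank'], keep='last'))   # rows 0 and 2 -> True; rows 1 and 3 -> False
print(demo.duplicated(['rank'], keep=False))    # every repeated value -> True (rows 0, 2, 3)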
#checking duplicate values on the basis of combinations
df2.duplicated(['city','rank'])
0 False 1 False 2 False 3 False 4 False 5 False 6 False 7 False 8 False 9 False 10 True 11 True 12 False 13 False 14 False 15 False 16 True 17 True 18 False 19 True 20 False 21 False 22 False 23 False 24 False 25 False 26 True 27 False 28 False 29 False 30 False 31 False 32 True 33 True 34 False 35 True 36 False 37 True 38 False 39 False 40 False 41 True 42 False 43 True 44 True 45 False 46 False 47 True 48 True 49 True 50 True 51 False 52 False 53 False 54 False 55 True 56 True 57 False 58 True 59 True dtype: bool
df2.drop_duplicates()
# No rows are dropped because there are no fully duplicate rows
| | city | rank | score1 | score2 |
|---|---|---|---|---|
| 0 | Delhi | 3rd | 50 | 53 |
| 1 | Bangalore | 1st | 74 | 80 |
| 2 | Hyderabad | 1st | 59 | 90 |
| 3 | Pune | 2nd | 67 | 51 |
| 4 | Bangalore | 5th | 63 | 73 |
| 5 | Pune | 3rd | 48 | 100 |
| 6 | Kolkata | 3rd | 32 | 89 |
| 7 | Faridabad | 4th | 48 | 65 |
| 8 | Hyderabad | 5th | 54 | 72 |
| 9 | Hyderabad | 4th | 80 | 84 |
| 10 | Hyderabad | 4th | 72 | 98 |
| 11 | Hyderabad | 5th | 75 | 67 |
| 12 | Pune | 1st | 61 | 54 |
| 13 | Faridabad | 5th | 58 | 100 |
| 14 | Kolkata | 1st | 70 | 51 |
| 15 | Bangalore | 2nd | 35 | 87 |
| 16 | Hyderabad | 1st | 40 | 100 |
| 17 | Pune | 1st | 51 | 67 |
| 18 | Delhi | 1st | 47 | 86 |
| 19 | Kolkata | 3rd | 77 | 94 |
| 20 | Jaipur | 3rd | 76 | 57 |
| 21 | Kolkata | 2nd | 37 | 54 |
| 22 | Mumbai | 3rd | 45 | 59 |
| 23 | Jaipur | 2nd | 41 | 66 |
| 24 | Noida | 2nd | 30 | 70 |
| 25 | Bangalore | 4th | 70 | 82 |
| 26 | Bangalore | 4th | 60 | 76 |
| 27 | Chennai | 5th | 41 | 92 |
| 28 | Jaipur | 1st | 61 | 65 |
| 29 | Jaipur | 4th | 73 | 76 |
| 30 | Jaipur | 5th | 41 | 66 |
| 31 | Mumbai | 1st | 32 | 97 |
| 32 | Jaipur | 3rd | 66 | 65 |
| 33 | Kolkata | 1st | 71 | 63 |
| 34 | Kolkata | 5th | 70 | 65 |
| 35 | Mumbai | 1st | 40 | 54 |
| 36 | Delhi | 4th | 57 | 83 |
| 37 | Pune | 2nd | 66 | 94 |
| 38 | Chennai | 2nd | 43 | 96 |
| 39 | Hyderabad | 3rd | 65 | 82 |
| 40 | Mumbai | 4th | 61 | 76 |
| 41 | Bangalore | 2nd | 73 | 84 |
| 42 | Pune | 4th | 65 | 56 |
| 43 | Jaipur | 5th | 39 | 69 |
| 44 | Jaipur | 1st | 52 | 88 |
| 45 | Bangalore | 3rd | 72 | 53 |
| 46 | Faridabad | 2nd | 67 | 62 |
| 47 | Hyderabad | 1st | 40 | 55 |
| 48 | Chennai | 5th | 70 | 51 |
| 49 | Kolkata | 3rd | 44 | 59 |
| 50 | Mumbai | 1st | 80 | 74 |
| 51 | Faridabad | 3rd | 74 | 62 |
| 52 | Delhi | 2nd | 79 | 85 |
| 53 | Mumbai | 2nd | 58 | 51 |
| 54 | Chennai | 1st | 62 | 88 |
| 55 | Faridabad | 3rd | 32 | 96 |
| 56 | Faridabad | 3rd | 36 | 76 |
| 57 | Noida | 3rd | 52 | 55 |
| 58 | Faridabad | 5th | 43 | 52 |
| 59 | Kolkata | 5th | 38 | 76 |
df2.drop_duplicates(['city'])
# only the first occurrence of each city is kept; later duplicates are dropped
| | city | rank | score1 | score2 |
|---|---|---|---|---|
| 0 | Delhi | 3rd | 50 | 53 |
| 1 | Bangalore | 1st | 74 | 80 |
| 2 | Hyderabad | 1st | 59 | 90 |
| 3 | Pune | 2nd | 67 | 51 |
| 6 | Kolkata | 3rd | 32 | 89 |
| 7 | Faridabad | 4th | 48 | 65 |
| 20 | Jaipur | 3rd | 76 | 57 |
| 22 | Mumbai | 3rd | 45 | 59 |
| 24 | Noida | 2nd | 30 | 70 |
| 27 | Chennai | 5th | 41 | 92 |
df2.drop_duplicates(['city','rank'])
# duplicate rows are dropped on the basis of the (city, rank) combination
| | city | rank | score1 | score2 |
|---|---|---|---|---|
| 0 | Delhi | 3rd | 50 | 53 |
| 1 | Bangalore | 1st | 74 | 80 |
| 2 | Hyderabad | 1st | 59 | 90 |
| 3 | Pune | 2nd | 67 | 51 |
| 4 | Bangalore | 5th | 63 | 73 |
| 5 | Pune | 3rd | 48 | 100 |
| 6 | Kolkata | 3rd | 32 | 89 |
| 7 | Faridabad | 4th | 48 | 65 |
| 8 | Hyderabad | 5th | 54 | 72 |
| 9 | Hyderabad | 4th | 80 | 84 |
| 12 | Pune | 1st | 61 | 54 |
| 13 | Faridabad | 5th | 58 | 100 |
| 14 | Kolkata | 1st | 70 | 51 |
| 15 | Bangalore | 2nd | 35 | 87 |
| 18 | Delhi | 1st | 47 | 86 |
| 20 | Jaipur | 3rd | 76 | 57 |
| 21 | Kolkata | 2nd | 37 | 54 |
| 22 | Mumbai | 3rd | 45 | 59 |
| 23 | Jaipur | 2nd | 41 | 66 |
| 24 | Noida | 2nd | 30 | 70 |
| 25 | Bangalore | 4th | 70 | 82 |
| 27 | Chennai | 5th | 41 | 92 |
| 28 | Jaipur | 1st | 61 | 65 |
| 29 | Jaipur | 4th | 73 | 76 |
| 30 | Jaipur | 5th | 41 | 66 |
| 31 | Mumbai | 1st | 32 | 97 |
| 34 | Kolkata | 5th | 70 | 65 |
| 36 | Delhi | 4th | 57 | 83 |
| 38 | Chennai | 2nd | 43 | 96 |
| 39 | Hyderabad | 3rd | 65 | 82 |
| 40 | Mumbai | 4th | 61 | 76 |
| 42 | Pune | 4th | 65 | 56 |
| 45 | Bangalore | 3rd | 72 | 53 |
| 46 | Faridabad | 2nd | 67 | 62 |
| 51 | Faridabad | 3rd | 74 | 62 |
| 52 | Delhi | 2nd | 79 | 85 |
| 53 | Mumbai | 2nd | 58 | 51 |
| 54 | Chennai | 1st | 62 | 88 |
| 57 | Noida | 3rd | 52 | 55 |
# Note: df here refers to the full, unmodified SuperStore data (before the drops above), hence the 9994-row counts below
df.describe()
| | Row ID | Ship Date | Postal Code | Sales | Quantity | Discount | Profit |
|---|---|---|---|---|---|---|---|
| count | 9994.000000 | 9994 | 9983.000000 | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 |
| mean | 4997.500000 | 2021-05-04 04:17:20.304182528 | 55245.233297 | 229.858001 | 3.789574 | 0.156203 | 28.656896 |
| min | 1.000000 | 2019-01-07 00:00:00 | 1040.000000 | 0.444000 | 1.000000 | 0.000000 | -6599.978000 |
| 25% | 2499.250000 | 2020-05-27 00:00:00 | 23223.000000 | 17.280000 | 2.000000 | 0.000000 | 1.728750 |
| 50% | 4997.500000 | 2021-06-29 00:00:00 | 57103.000000 | 54.490000 | 3.000000 | 0.200000 | 8.666500 |
| 75% | 7495.750000 | 2022-05-18 00:00:00 | 90008.000000 | 209.940000 | 5.000000 | 0.200000 | 29.364000 |
| max | 9994.000000 | 2023-01-05 00:00:00 | 99301.000000 | 22638.480000 | 14.000000 | 0.800000 | 8399.976000 |
| std | 2885.163629 | NaN | 32038.715955 | 623.245101 | 2.225110 | 0.206452 | 234.260108 |
1.1 What is an outlier?
• A data point that lies significantly far away from the other data points
• IQR (Inter Quartile Range)
• IQR = Q3 - Q1
• Lower Boundary = Q1 - (1.5 * IQR)
• Upper Boundary = Q3 + (1.5 * IQR) (see the sketch below)
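A minimal sketch of these boundary formulas as a reusable helper (the df1['Profit'] application is left commented, because the notebook computes the same thing step by step below):
import pandas as pd
def iqr_bounds(s: pd.Series, k: float = 1.5):
    """Return the (lower, upper) IQR outlier boundaries for a numeric Series."""
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr
# low, high = iqr_bounds(df1['Profit'])
# outliers = df1[(df1['Profit'] < low) | (df1['Profit'] > high)]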
df1.head()
| | Order ID | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | State | Region | ... | Category | Sub-Category | Product Name | Sales | Quantity | Profit | Returns | Payment Mode | ind1 | ind2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CA-2019-160304 | 2019-01-07 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | Maryland | East | ... | Furniture | Bookcases | Bush Westfield Collection Bookcases, Medium Ch... | 73.94 | 1 | 28.2668 | NaN | Online | NaN | NaN |
| 4 | US-2019-116365 | 2019-01-08 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | Texas | Central | ... | Technology | Accessories | Imation Clip USB flash drive - 8 GB | 30.08 | 2 | -5.2640 | NaN | Online | NaN | NaN |
| 5 | US-2019-116365 | 2019-01-08 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | Texas | Central | ... | Technology | Accessories | WD My Passport Ultra 1TB Portable External Har... | 165.60 | 3 | -6.2100 | NaN | Online | NaN | NaN |
| 6 | US-2019-116365 | 2019-01-08 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | Texas | Central | ... | Technology | Phones | AT&T 17929 Lendline Telephone | 180.96 | 5 | 13.5720 | NaN | Cards | NaN | NaN |
| 7 | CA-2019-105207 | 2019-01-08 | Standard Class | BO-11350 | Bill Overfelt | Corporate | United States | Broken Arrow | Oklahoma | Central | ... | Furniture | Tables | Hon Practical Foundations 30 x 60 Training Tab... | 1592.85 | 7 | 350.4270 | NaN | COD | NaN | NaN |
5 rows × 21 columns
df1['Profit'].describe()
count    5898.000000
mean       29.692769
std       259.654675
min     -6599.978000
25%         1.794375
50%         8.502250
75%        28.436400
max      8399.976000
Name: Profit, dtype: float64
import seaborn as sns
sns.boxplot(x=df1['Profit'])
<Axes: xlabel='Profit'>
sns.boxplot(x=df1['Sales'])
<Axes: xlabel='Sales'>
sns.boxplot(x=df1['Quantity'])
<Axes: xlabel='Quantity'>
df1.shape
(5898, 21)
# Let's calculate the first quartile, third quartile and the interquartile range
first_quartile = df1['Profit'].quantile(.25)
third_quartile = df1['Profit'].quantile(.75)
IQR = third_quartile - first_quartile
# a multiplier of 3 (instead of 1.5) gives a stricter, extreme-outlier boundary
new_boundary = third_quartile + 3 * IQR
# drop the outlier rows
new_data = df1.drop(df1[df1['Profit']>new_boundary].index, axis = 0,inplace=False)
# 475 rows are dropped (5898 - 5423), as the shape check confirms
new_data.shape
(5423, 21)
sns.boxplot(x=new_data['Profit'])
<Axes: xlabel='Profit'>
# Let's calculate the first quartile, third quartile and the interquartile range for Quantity
first_quartile1 = df1['Quantity'].quantile(.25)
third_quartile1 = df1['Quantity'].quantile(.75)
IQR1 = third_quartile1 - first_quartile1
new_boundary1 = third_quartile1 + 3 * IQR1
# drop the outlier rows
new_data1 = df1.drop(df1[df1['Quantity']>new_boundary1].index, axis = 0,inplace=False)
# no rows are dropped: no Quantity value exceeds the boundary, so the shape is unchanged
new_data1.shape
(5898, 21)
sns.boxplot(x=new_data1['Quantity'])
<Axes: xlabel='Quantity'>
1.2 Outlier detection using IQR
import random
# List of names (50 names)
names = [
'Mohan', 'Maria', 'Deepak', 'Kunal', 'Piyush', 'Avinash', 'Lisa', 'Smita', 'Tanu', 'Khusboo',
'Nishant', 'Johnson', 'Donald', 'Rakesh', 'Pritvi', 'Roy', 'Ashish', 'Abhishek', 'Jassi', 'Nikita',
'Aman', 'Divya', 'Sneha', 'Ritu', 'Anuj', 'Chetan', 'Saurabh', 'Rohit', 'Gaurav', 'Meena',
'Seema', 'Priya', 'Alok', 'Sanya', 'Nilesh', 'Vikas', 'Ankita', 'Suraj', 'Bhavna', 'Kiran',
'Simran', 'Radhika', 'Vivek', 'Pranav', 'Hemant', 'Aditi', 'Komal', 'Tarun', 'Neha', 'Ishaan'
]
# Generate random realistic heights between 1.4m and 2.1m
heights = [round(random.uniform(1.4, 2.1), 2) for _ in range(50)]
# Create the DataFrame
raw = {
'name': names,
'height': heights
}
df3 = pd.DataFrame(raw)
df3
| | name | height |
|---|---|---|
| 0 | Mohan | 1.82 |
| 1 | Maria | 1.78 |
| 2 | Deepak | 1.77 |
| 3 | Kunal | 1.77 |
| 4 | Piyush | 1.60 |
| 5 | Avinash | 1.89 |
| 6 | Lisa | 1.82 |
| 7 | Smita | 1.89 |
| 8 | Tanu | 1.89 |
| 9 | Khusboo | 1.45 |
| 10 | Nishant | 1.95 |
| 11 | Johnson | 1.42 |
| 12 | Donald | 2.01 |
| 13 | Rakesh | 2.01 |
| 14 | Pritvi | 1.44 |
| 15 | Roy | 1.92 |
| 16 | Ashish | 1.71 |
| 17 | Abhishek | 1.84 |
| 18 | Jassi | 1.65 |
| 19 | Nikita | 1.61 |
| 20 | Aman | 1.78 |
| 21 | Divya | 1.52 |
| 22 | Sneha | 1.88 |
| 23 | Ritu | 1.78 |
| 24 | Anuj | 1.73 |
| 25 | Chetan | 1.50 |
| 26 | Saurabh | 1.57 |
| 27 | Rohit | 2.09 |
| 28 | Gaurav | 1.43 |
| 29 | Meena | 1.92 |
| 30 | Seema | 1.50 |
| 31 | Priya | 1.85 |
| 32 | Alok | 1.89 |
| 33 | Sanya | 1.59 |
| 34 | Nilesh | 1.93 |
| 35 | Vikas | 1.67 |
| 36 | Ankita | 1.76 |
| 37 | Suraj | 2.10 |
| 38 | Bhavna | 1.48 |
| 39 | Kiran | 2.09 |
| 40 | Simran | 1.52 |
| 41 | Radhika | 1.42 |
| 42 | Vivek | 1.52 |
| 43 | Pranav | 1.92 |
| 44 | Hemant | 1.54 |
| 45 | Aditi | 1.83 |
| 46 | Komal | 1.99 |
| 47 | Tarun | 1.48 |
| 48 | Neha | 1.73 |
| 49 | Ishaan | 1.92 |
df3.describe()
| | height |
|---|---|
| count | 50.00000 |
| mean | 1.74340 |
| std | 0.20022 |
| min | 1.42000 |
| 25% | 1.54750 |
| 50% | 1.77500 |
| 75% | 1.89000 |
| max | 2.10000 |
import pandas as pd
import random
import numpy as np
# Names (50 different names)
names = [
'Mohan', 'Maria', 'Deepak', 'Kunal', 'Piyush', 'Avinash', 'Lisa', 'Smita', 'Tanu', 'Khusboo',
'Nishant', 'Johnson', 'Donald', 'Rakesh', 'Pritvi', 'Roy', 'Ashish', 'Abhishek', 'Jassi', 'Nikita',
'Aman', 'Divya', 'Sneha', 'Ritu', 'Anuj', 'Chetan', 'Saurabh', 'Rohit', 'Gaurav', 'Meena',
'Seema', 'Priya', 'Alok', 'Sanya', 'Nilesh', 'Vikas', 'Ankita', 'Suraj', 'Bhavna', 'Kiran',
'Simran', 'Radhika', 'Vivek', 'Pranav', 'Hemant', 'Aditi', 'Komal', 'Tarun', 'Neha', 'Ishaan'
]
# Generate 45 "normal" heights in the 4.5-7.0 range
normal_heights = [round(random.uniform(4.5, 7.0), 2) for _ in range(45)]
# Add 5 "outlier" very large heights
outliers = [round(random.uniform(12.0, 40.0), 2) for _ in range(5)]
# Combine and shuffle
heights = normal_heights + outliers
random.shuffle(heights)
# Create DataFrame
raw = {
'name': names,
'height': heights
}
df4 = pd.DataFrame(raw)
df4.describe()
| | height |
|---|---|
| count | 50.000000 |
| mean | 8.324000 |
| std | 7.869812 |
| min | 4.620000 |
| 25% | 5.370000 |
| 50% | 5.830000 |
| 75% | 6.480000 |
| max | 37.800000 |
# Calculating Q1 and Q3
Q1 = df4.height.quantile(.25)
Q3 = df4.height.quantile(.75)
Q1 , Q3
(5.37, 6.4799999999999995)
# Calculating IQR
IQR = Q3 - Q1
IQR
1.1099999999999994
# Calculating the lower and upper limits
lower_limit = Q1 - 1.5 * (IQR)
upper_limit = Q3 + 1.5 * (IQR)
lower_limit,upper_limit
(3.705000000000001, 8.145)
#outliers
df4[(df4.height<lower_limit)|(df4.height>upper_limit)]
| | name | height |
|---|---|---|
| 17 | Abhishek | 34.14 |
| 31 | Priya | 23.60 |
| 34 | Nilesh | 37.80 |
| 42 | Vivek | 26.07 |
| 45 | Aditi | 33.73 |
# data with no outliers
df_no_outliers = df4[(df4.height>lower_limit) & (df4.height < upper_limit) ]
df_no_outliers
| | name | height |
|---|---|---|
| 0 | Mohan | 6.83 |
| 1 | Maria | 4.77 |
| 2 | Deepak | 6.01 |
| 3 | Kunal | 6.28 |
| 4 | Piyush | 5.76 |
| 5 | Avinash | 6.84 |
| 6 | Lisa | 5.90 |
| 7 | Smita | 6.96 |
| 8 | Tanu | 6.82 |
| 9 | Khusboo | 6.36 |
| 10 | Nishant | 6.96 |
| 11 | Johnson | 5.10 |
| 12 | Donald | 5.37 |
| 13 | Rakesh | 5.90 |
| 14 | Pritvi | 4.83 |
| 15 | Roy | 6.35 |
| 16 | Ashish | 5.82 |
| 18 | Jassi | 5.27 |
| 19 | Nikita | 5.72 |
| 20 | Aman | 5.82 |
| 21 | Divya | 4.62 |
| 22 | Sneha | 5.70 |
| 23 | Ritu | 4.80 |
| 24 | Anuj | 6.32 |
| 25 | Chetan | 5.37 |
| 26 | Saurabh | 5.94 |
| 27 | Rohit | 5.65 |
| 28 | Gaurav | 5.45 |
| 29 | Meena | 4.78 |
| 30 | Seema | 5.90 |
| 32 | Alok | 5.19 |
| 33 | Sanya | 5.73 |
| 35 | Vikas | 5.69 |
| 36 | Ankita | 5.39 |
| 37 | Suraj | 6.52 |
| 38 | Bhavna | 5.84 |
| 39 | Kiran | 5.36 |
| 40 | Simran | 6.33 |
| 41 | Radhika | 5.67 |
| 43 | Pranav | 5.08 |
| 44 | Hemant | 5.09 |
| 46 | Komal | 6.08 |
| 47 | Tarun | 5.32 |
| 48 | Neha | 6.84 |
| 49 | Ishaan | 6.53 |
df5 =pd.read_csv("D:/docs/dataset/archive/diabetes.csv")
df5.sample(5)
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 212 | 7 | 179 | 95 | 31 | 0 | 34.2 | 0.164 | 60 | 0 |
| 499 | 6 | 154 | 74 | 32 | 193 | 29.3 | 0.839 | 39 | 0 |
| 488 | 4 | 99 | 72 | 17 | 0 | 25.6 | 0.294 | 28 | 0 |
| 268 | 0 | 102 | 52 | 0 | 0 | 25.1 | 0.078 | 21 | 0 |
| 454 | 2 | 100 | 54 | 28 | 105 | 37.8 | 0.498 | 24 | 0 |
df5.shape
(768, 9)
plt.hist(df5.BMI,bins=20,rwidth=0.8)
plt.xlabel('Body Mass Index (BMI)')
plt.ylabel('Count')
plt.show()
Refer to https://www.mathsisfun.com/data/standard-normal-distribution.html for more details
from scipy.stats import norm
import numpy as np
plt.hist(df5.BMI,bins=20,rwidth=0.8,density=True)
plt.xlabel('BMI')
plt.ylabel('Density')  # with density=True the y axis shows density, not raw counts
rng= np.arange(df5.BMI.min(),df5.BMI.max(),0.1)
plt.plot(rng,norm.pdf(rng,df5.BMI.mean(),df5.BMI.std()))
[<matplotlib.lines.Line2D at 0x264af9c21b0>]
df5.BMI.std()
7.8841603203754405
df5.BMI.mean()
31.992578124999998
upper_limit = df5.BMI.mean() + 3 * df5.BMI.std()
upper_limit
55.645059086126324
#Calculating lower_limit
lower_limit = df5.BMI.mean() - 3 * df5.BMI.std()
lower_limit
8.340097163873676
# checking the outliers
df5[(df5.BMI>upper_limit )| (df5.BMI <lower_limit)]
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 9 | 8 | 125 | 96 | 0 | 0 | 0.0 | 0.232 | 54 | 1 |
| 49 | 7 | 105 | 0 | 0 | 0 | 0.0 | 0.305 | 24 | 0 |
| 60 | 2 | 84 | 0 | 0 | 0 | 0.0 | 0.304 | 21 | 0 |
| 81 | 2 | 74 | 0 | 0 | 0 | 0.0 | 0.102 | 22 | 0 |
| 145 | 0 | 102 | 75 | 23 | 0 | 0.0 | 0.572 | 21 | 0 |
| 177 | 0 | 129 | 110 | 46 | 130 | 67.1 | 0.319 | 26 | 1 |
| 371 | 0 | 118 | 64 | 23 | 89 | 0.0 | 1.731 | 21 | 0 |
| 426 | 0 | 94 | 0 | 0 | 0 | 0.0 | 0.256 | 25 | 0 |
| 445 | 0 | 180 | 78 | 63 | 14 | 59.4 | 2.420 | 25 | 1 |
| 494 | 3 | 80 | 0 | 0 | 0 | 0.0 | 0.174 | 22 | 0 |
| 522 | 6 | 114 | 0 | 0 | 0 | 0.0 | 0.189 | 26 | 0 |
| 673 | 3 | 123 | 100 | 35 | 240 | 57.3 | 0.880 | 22 | 0 |
| 684 | 5 | 136 | 82 | 0 | 0 | 0.0 | 0.640 | 69 | 0 |
| 706 | 10 | 115 | 0 | 0 | 0 | 0.0 | 0.261 | 30 | 1 |
df_no_outlier_std_dev = df5[(df5.BMI < upper_limit) & (df5.BMI > lower_limit)]
df_no_outlier_std_dev.shape
(754, 9)
df5.shape[0] - df_no_outlier_std_dev.shape[0]
14
1.3 Outlier detection and removal using Z score
A Z score indicates how many standard deviations a data point lies away from the mean: z = (x − μ) / σ. For example, if the mean is 66.37 and the standard deviation is 3.84, a data point of 77.91 has a Z score of 3, because it lies 3 standard deviations above the mean (77.91 = 66.37 + 3 × 3.84).
df5['zscore'] = (df5.BMI - df5.BMI.mean())/df5.BMI.std()
df5.head(5)
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | zscore |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 0.203880 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | -0.683976 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | -1.102537 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | -0.493721 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 1.408828 |
#outliers
df5[(df5['zscore']>3) | (df5['zscore']<-3)]
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | zscore |
|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 8 | 125 | 96 | 0 | 0 | 0.0 | 0.232 | 54 | 1 | -4.057829 |
| 49 | 7 | 105 | 0 | 0 | 0 | 0.0 | 0.305 | 24 | 0 | -4.057829 |
| 60 | 2 | 84 | 0 | 0 | 0 | 0.0 | 0.304 | 21 | 0 | -4.057829 |
| 81 | 2 | 74 | 0 | 0 | 0 | 0.0 | 0.102 | 22 | 0 | -4.057829 |
| 145 | 0 | 102 | 75 | 23 | 0 | 0.0 | 0.572 | 21 | 0 | -4.057829 |
| 177 | 0 | 129 | 110 | 46 | 130 | 67.1 | 0.319 | 26 | 1 | 4.452906 |
| 371 | 0 | 118 | 64 | 23 | 89 | 0.0 | 1.731 | 21 | 0 | -4.057829 |
| 426 | 0 | 94 | 0 | 0 | 0 | 0.0 | 0.256 | 25 | 0 | -4.057829 |
| 445 | 0 | 180 | 78 | 63 | 14 | 59.4 | 2.420 | 25 | 1 | 3.476264 |
| 494 | 3 | 80 | 0 | 0 | 0 | 0.0 | 0.174 | 22 | 0 | -4.057829 |
| 522 | 6 | 114 | 0 | 0 | 0 | 0.0 | 0.189 | 26 | 0 | -4.057829 |
| 673 | 3 | 123 | 100 | 35 | 240 | 57.3 | 0.880 | 22 | 0 | 3.209907 |
| 684 | 5 | 136 | 82 | 0 | 0 | 0.0 | 0.640 | 69 | 0 | -4.057829 |
| 706 | 10 | 115 | 0 | 0 | 0 | 0.0 | 0.261 | 30 | 1 | -4.057829 |
#removing outliers as per Z score
df_no_outliers = df5[(df5.zscore <3) & (df5.zscore > -3)]
df_no_outliers.head()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | zscore |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 0.203880 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | -0.683976 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | -1.102537 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | -0.493721 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 1.408828 |
# checking the number of outliers
df5.shape[0] - df_no_outliers.shape[0]
14
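For reference, the same filter can be written with scipy's zscore helper (a sketch, assuming scipy is available; ddof=1 matches the sample standard deviation that df5.BMI.std() uses above):
from scipy import stats
import numpy as np
z = np.abs(stats.zscore(df5['BMI'], ddof=1))   # same values as the manual zscore column
df5_no_outliers = df5[z < 3]                   # should keep the same 754 rows as above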
df5.head()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | zscore |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 0.203880 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | -0.683976 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | -1.102537 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | -0.493721 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 1.408828 |
df5['Pregnancies']
0 6
1 1
2 8
3 1
4 0
..
763 10
764 2
765 5
766 1
767 1
Name: Pregnancies, Length: 768, dtype: int64
from collections import Counter
Counter(df5.Pregnancies)
Counter({1: 135,
0: 111,
2: 103,
3: 75,
4: 68,
5: 57,
6: 50,
7: 45,
8: 38,
9: 28,
10: 24,
11: 11,
13: 10,
12: 9,
14: 2,
15: 1,
17: 1})
# to get the frequency
df5.Pregnancies.value_counts()
Pregnancies
1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: count, dtype: int64
#checking the type
type(df5.Pregnancies.value_counts())
pandas.core.series.Series
g1 = df5['Pregnancies'].value_counts()
g1.plot()
<Axes: xlabel='Pregnancies'>
#Visualizing through a bar chart
g1.plot(kind='bar')
<Axes: xlabel='Pregnancies'>
#Visualizing through pie chart
g1.plot(kind='pie')
<Axes: ylabel='count'>
Refer to https://medium.com/analytics-vidhya/intro-to-univariate-analysis-de75454b4719 for more details
tdf = pd.read_csv('D:/Pdf/college/titanic.csv')
tdf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  156 non-null    int64
 1   Survived     156 non-null    int64
 2   Pclass       156 non-null    int64
 3   Lname        156 non-null    object
 4   Name         156 non-null    object
 5   Sex          156 non-null    object
 6   Age          126 non-null    float64
 7   SibSp        156 non-null    int64
 8   Parch        156 non-null    int64
 9   Ticket       156 non-null    object
 10  Fare         156 non-null    float64
 11  Cabin        31 non-null     object
 12  Embarked     155 non-null    object
dtypes: float64(2), int64(5), object(6)
memory usage: 16.0+ KB
# checking the top 5 records of the dataset
tdf.head()
| | PassengerId | Survived | Pclass | Lname | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund | Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings | Mrs. John Bradley (Florence Briggs Thayer) | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen | Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle | Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen | Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
#Let's visualize the survivors count
sns.countplot(x=tdf.Survived)
plt.show()
# Let's check how many males and females survived
sns.countplot(x=tdf.Survived, hue=tdf.Sex)
plt.show()
#Let's distinguish the data by passenger class
sns.countplot(x=tdf.Survived, hue=tdf.Pclass)
plt.show()
1.4.1 Numerical Data
tips_df = pd.read_csv('D:/docs/downloads/tips.csv')
tips_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object
 3   smoker      244 non-null    object
 4   day         244 non-null    object
 5   time        244 non-null    object
 6   size        244 non-null    int64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
# checking the top 5 records of the dataset
tips_df.head()
| | total_bill | tip | sex | smoker | day | time | size |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
#distribution of tips
sns.distplot(x=tips_df.tip, hist=True, kde=True, color='r')
<Axes: ylabel='Density'>
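distplot still runs here but is deprecated in recent seaborn releases; a sketch of the modern equivalent:
sns.histplot(tips_df['tip'], kde=True, color='r')   # histogram plus KDE curve in one call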
# to find out the range
sns.boxplot(tips_df.tip, color = 'b')
<Axes: ylabel='tip'>
fig, axes = plt.subplots(1,2, figsize =(12,6))
#distribution of tips
sns.distplot(x=tips_df.tip, hist=True, kde=True, color='r',ax=axes[0])
# to find out the range
sns.boxplot(tips_df.tip, color = 'b',ax=axes[1])
axes[0].set_title("Distribution of Tips",fontsize=15)
axes[1].set_title("Tips Range",fontsize=15)
Text(0.5, 1.0, 'Tips Range')
1.5 Subplots
x = np.linspace(-1,1,101)
y1 = x
y2 = x**2
y3 = x**3
y4 = x**4
fig,ax = plt.subplots(2,1)
ax[0].plot(x,y1,color ='b')
ax[1].plot(x,y2, color='r')
[<matplotlib.lines.Line2D at 0x264b43b4b30>]
fig,ax = plt.subplots(2,2)
ax[0,0].plot(x,y1, color ='b')
ax[0,1].plot(x,y2, color='r')
ax[1,0].plot(x,y3, color='y')
ax[1,1].plot(x,y4, color='b')
fig.show()
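When the four panels' tick labels start colliding, tight_layout re-spaces them; a small variation of the grid above:
fig, ax = plt.subplots(2, 2)
ax[0, 0].plot(x, y1, color='b')
ax[0, 1].plot(x, y2, color='r')
ax[1, 0].plot(x, y3, color='y')
ax[1, 1].plot(x, y4, color='b')
fig.tight_layout()   # adds padding so labels and ticks don't overlap
plt.show()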
#Load the dataset
d = pd.read_csv('C:/Users/Acer/Downloads/diamonds.csv')
d.head(20)
| | Unnamed: 0 | carat | cut | color | clarity | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 2 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 3 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 4 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 5 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
| 5 | 6 | 0.24 | Very Good | J | VVS2 | 62.8 | 57.0 | 336 | 3.94 | 3.96 | 2.48 |
| 6 | 7 | 0.24 | Very Good | I | VVS1 | 62.3 | 57.0 | 336 | 3.95 | 3.98 | 2.47 |
| 7 | 8 | 0.26 | Very Good | H | SI1 | 61.9 | 55.0 | 337 | 4.07 | 4.11 | 2.53 |
| 8 | 9 | 0.22 | Fair | E | VS2 | 65.1 | 61.0 | 337 | 3.87 | 3.78 | 2.49 |
| 9 | 10 | 0.23 | Very Good | H | VS1 | 59.4 | 61.0 | 338 | 4.00 | 4.05 | 2.39 |
| 10 | 11 | 0.30 | Good | J | SI1 | 64.0 | 55.0 | 339 | 4.25 | 4.28 | 2.73 |
| 11 | 12 | 0.23 | Ideal | J | VS1 | 62.8 | 56.0 | 340 | 3.93 | 3.90 | 2.46 |
| 12 | 13 | 0.22 | Premium | F | SI1 | 60.4 | 61.0 | 342 | 3.88 | 3.84 | 2.33 |
| 13 | 14 | 0.31 | Ideal | J | SI2 | 62.2 | 54.0 | 344 | 4.35 | 4.37 | 2.71 |
| 14 | 15 | 0.20 | Premium | E | SI2 | 60.2 | 62.0 | 345 | 3.79 | 3.75 | 2.27 |
| 15 | 16 | 0.32 | Premium | E | I1 | 60.9 | 58.0 | 345 | 4.38 | 4.42 | 2.68 |
| 16 | 17 | 0.30 | Ideal | I | SI2 | 62.0 | 54.0 | 348 | 4.31 | 4.34 | 2.68 |
| 17 | 18 | 0.30 | Good | J | SI1 | 63.4 | 54.0 | 351 | 4.23 | 4.29 | 2.70 |
| 18 | 19 | 0.30 | Good | J | SI1 | 63.8 | 56.0 | 351 | 4.23 | 4.26 | 2.71 |
| 19 | 20 | 0.30 | Very Good | J | SI1 | 62.7 | 59.0 | 351 | 4.21 | 4.27 | 2.66 |
#checking the shape
d.shape
(53940, 11)
# filter the rows where clarity is in {'SI1','VS2'}
d1 =d[d.clarity.isin(['SI1','VS2'])]
# checking the shape after filtering
d1.shape
(25323, 11)
Countplots
sns.set_style('darkgrid')
sns.countplot(x='color',data=d1)
<Axes: xlabel='color', ylabel='count'>
#checking the value counts of color
d1.color.value_counts()
color
E    4896
F    4332
G    4323
H    3918
D    3780
I    2593
J    1481
Name: count, dtype: int64
#use y to change the orientation instead of x
sns.countplot(y='cut',data=d1)
<Axes: xlabel='count', ylabel='cut'>
#checking the datatypes
d1.dtypes
Unnamed: 0      int64
carat         float64
cut            object
color          object
clarity        object
depth         float64
table         float64
price           int64
x             float64
y             float64
z             float64
dtype: object
#Providing the order
color_order = ['J','I','H','G','F','E','D']
sns.countplot(x='color',data=d1,order=color_order);
1.6 Order Ascending or Descending
#checking the value counts
d1.color.value_counts()
color
E    4896
F    4332
G    4323
H    3918
D    3780
I    2593
J    1481
Name: count, dtype: int64
#checking the Index
d1.color.value_counts().index
Index(['E', 'F', 'G', 'H', 'D', 'I', 'J'], dtype='object', name='color')
#plotting as per index (plotting in descending order)
sns.countplot(x='color', data=d1, order=d1.color.value_counts().index);
#plotting in ascending order
sns.countplot(x='color', data=d1, order=d1.color.value_counts().index[::-1]);
# setting a single fill color (lightblue)
sns.countplot(x='color', data=d1, order=d1.color.value_counts().index[::-1],color='lightblue');
# applying the 'twilight' palette
sns.countplot(x='color', data=d1, order=d1.color.value_counts().index[::-1],palette='twilight');
#line width and edge color
sns.countplot(x='color',data=d1, lw=4, ec='black')
<Axes: xlabel='color', ylabel='count'>
#hatching
sns.countplot(x='color',data=d1, lw=4, ec='black',hatch='/')
<Axes: xlabel='color', ylabel='count'>
1.7 Scatter plot
y1 = np.random.randn(70)
y2 = np.random.randn(70)
# Both y1 and y2 should be of same size
plt.scatter(y1,y2)
plt.show()
## adjust axis, if required
y1 = np.random.randn(70)
y2 = np.random.randn(70)
plt.scatter(y1,y2)
plt.axis([-3,3,-3,3]) #plt.axis([Xmin,Xmax,Ymin,Ymax])
plt.show()
# using the scatter plot with a common x axis
x = np.arange(70)
y1 = np.random.randn(70)
y2 = np.random.randn(70)
plt.scatter(x,y1, marker='o', label ='y1 with x')
plt.scatter(x,y2, marker='v', label ='y2 with x')
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0x264b5293ec0>
#assigning the colors
x = np.arange(70)
y1 = np.random.randn(70)
y2 = np.random.randn(70)
plt.scatter(x,y1, marker='o', label ='y1 with x',color='g')
plt.scatter(x,y2, marker='v', label ='y2 with x',color='b')
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0x264b7b6c320>
1.8 Line Plot
!pip install yfinance
Collecting yfinance
Downloading yfinance-0.2.56-py2.py3-none-any.whl.metadata (5.8 kB)
Requirement already satisfied: pandas>=1.3.0 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (2.2.2)
Requirement already satisfied: numpy>=1.16.5 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (1.26.4)
Requirement already satisfied: requests>=2.31 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (2.32.2)
Collecting multitasking>=0.0.7 (from yfinance)
Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Requirement already satisfied: platformdirs>=2.0.0 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (3.10.0)
Requirement already satisfied: pytz>=2022.5 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (2024.1)
Requirement already satisfied: frozendict>=2.3.4 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (2.4.2)
Collecting peewee>=3.16.2 (from yfinance)
Downloading peewee-3.17.9.tar.gz (3.0 MB)
Installing build dependencies: started
Installing build dependencies: still running...
Installing build dependencies: finished with status 'done'
Getting requirements to build wheel: started
Getting requirements to build wheel: finished with status 'done'
Preparing metadata (pyproject.toml): started
Preparing metadata (pyproject.toml): finished with status 'done'
Requirement already satisfied: beautifulsoup4>=4.11.1 in c:\users\acer\anaconda3\lib\site-packages (from yfinance) (4.12.3)
Requirement already satisfied: soupsieve>1.2 in c:\users\acer\anaconda3\lib\site-packages (from beautifulsoup4>=4.11.1->yfinance) (2.5)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=1.3.0->yfinance) (2.9.0.post0)
Requirement already satisfied: tzdata>=2022.7 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=1.3.0->yfinance) (2023.3)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.31->yfinance) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.31->yfinance) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.31->yfinance) (2.2.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.31->yfinance) (2024.12.14)
Requirement already satisfied: six>=1.5 in c:\users\acer\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=1.3.0->yfinance) (1.16.0)
Downloading yfinance-0.2.56-py2.py3-none-any.whl (113 kB)
Downloading multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Building wheels for collected packages: peewee
Building wheel for peewee (pyproject.toml): started
Building wheel for peewee (pyproject.toml): finished with status 'done'
Created wheel for peewee: filename=peewee-3.17.9-py3-none-any.whl size=139127 sha256=d4e0fc6219e187c4aedf623d122849492ef79b7793b740305def523e358a9d50
Stored in directory: c:\users\acer\appdata\local\pip\cache\wheels\43\ef\2d\2c51d496bf084945ffdf838b4cc8767b8ba1cc20eb41588831
Successfully built peewee
Installing collected packages: peewee, multitasking, yfinance
Successfully installed multitasking-0.0.11 peewee-3.17.9 yfinance-0.2.56
#pip install --upgrade pandas_datareader
Requirement already satisfied: pandas_datareader in c:\users\acer\anaconda3\lib\site-packages (0.10.0)
Requirement already satisfied: lxml in c:\users\acer\anaconda3\lib\site-packages (from pandas_datareader) (5.2.1)
Requirement already satisfied: pandas>=0.23 in c:\users\acer\anaconda3\lib\site-packages (from pandas_datareader) (2.2.2)
Requirement already satisfied: requests>=2.19.0 in c:\users\acer\anaconda3\lib\site-packages (from pandas_datareader) (2.32.2)
Requirement already satisfied: numpy>=1.26.0 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=0.23->pandas_datareader) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=0.23->pandas_datareader) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=0.23->pandas_datareader) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\acer\anaconda3\lib\site-packages (from pandas>=0.23->pandas_datareader) (2023.3)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.19.0->pandas_datareader) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.19.0->pandas_datareader) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.19.0->pandas_datareader) (2.2.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\acer\anaconda3\lib\site-packages (from requests>=2.19.0->pandas_datareader) (2024.12.14)
Requirement already satisfied: six>=1.5 in c:\users\acer\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=0.23->pandas_datareader) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
# extract data from various Internet sources into a pandas DataFrame
#import pandas_datareader as pdr
import yfinance as yf
#extracting Data
stocks = ['GOOG', 'AMZN']
#data = pdr.get_data_yahoo(stocks, start = '2022-01-01')['Close']
data = yf.download(stocks, start='2022-01-01')['Close']
data.head()
YF.download() has changed argument auto_adjust default to True
[*********************100%***********************] 2 of 2 completed
1 Failed download:
['GOOG']: ConnectionError(ReadTimeoutError("HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out."))
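Note: the GOOG request timed out here, which is why its column below is all NaN; re-running the download cell with a working connection normally fills it in, and until then the GOOG line in the plots further down stays empty.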
| Ticker | AMZN | GOOG |
|---|---|---|
| Date | ||
| 2022-01-03 | 170.404495 | NaN |
| 2022-01-04 | 167.522003 | NaN |
| 2022-01-05 | 164.356995 | NaN |
| 2022-01-06 | 163.253998 | NaN |
| 2022-01-07 | 162.554001 | NaN |
from matplotlib import rcParams
rcParams['figure.figsize'] = 10,6
plt.plot(data.AMZN)
plt.plot(data.GOOG)
plt.grid(True, color='k', linestyle=':')
plt.title("Amazon & Google Prices")
plt.xlabel("Date")
Text(0.5, 0, 'Date')
from matplotlib import rcParams
rcParams['figure.figsize'] = 10,6
plt.plot(data.AMZN)
plt.plot(data.GOOG)
plt.grid(True, color='k', linestyle=':')
plt.title("Amazon & Google Prices")
plt.xlabel("Date")
plt.legend(['AMZN','GOOG'], loc=2)  # label order matches the plot calls above
<matplotlib.legend.Legend at 0x264ba57f260>
1.8.1 Refer to the links below for more details
• https://matplotlib.org/3.1.1/gallery/style_sheets/style_sheets_reference.html
• https://matplotlib.org/3.1.1/api/markers_api.html
#loading the Dataset
tips = pd.read_csv('D:/docs/downloads/tips.csv')
tips.head()
| | total_bill | tip | sex | smoker | day | time | size |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
1.9 Boxplot
sns.boxplot(tips['size'])
<Axes: ylabel='size'>
sns.boxplot(tips['total_bill'])
<Axes: ylabel='total_bill'>
sns.boxplot(x='sex', y ='total_bill', data=tips)
<Axes: xlabel='sex', ylabel='total_bill'>
sns.boxplot(x='day', y ='total_bill', data=tips)
<Axes: xlabel='day', ylabel='total_bill'>
sns.boxplot(x='day', y ='total_bill', data=tips,hue='sex')
<Axes: xlabel='day', ylabel='total_bill'>
sns.boxplot(x='day', y ='total_bill', data=tips,hue='sex',palette ='husl')
<Axes: xlabel='day', ylabel='total_bill'>
sns.boxplot(x='day', y ='total_bill', data=tips,hue='smoker',palette='coolwarm')
<Axes: xlabel='day', ylabel='total_bill'>
sns.boxplot(x='day', y ='total_bill', data=tips,hue='time')
<Axes: xlabel='day', ylabel='total_bill'>
1.10 Joint Distribution
iris = pd.read_csv('C:/Users/Acer/Downloads/iris.csv')
sns.jointplot(x='total_bill', y='tip', data=tips)
<seaborn.axisgrid.JointGrid at 0x264bb1a0f80>
iris.columns
Index(['150', '4', 'setosa', 'versicolor', 'virginica'], dtype='object')
# sns.jointplot(x='SepalLengthCm', y='SepalWidthCm', data=iris)
# sns.jointplot(x='total_bill', y='tip', data=tips, kind='reg',color='green')
# adding regression lines
sns.jointplot(x='total_bill', y='tip', data=tips, kind='reg')
<seaborn.axisgrid.JointGrid at 0x264bdd90170>
1.11 Bar Plots
sns.barplot(x='day', y ='tip',data=tips)
<Axes: xlabel='day', ylabel='tip'>
sns.barplot(x='day', y ='total_bill',data=tips)
<Axes: xlabel='day', ylabel='total_bill'>
sns.barplot(x='day', y ='total_bill',data=tips,hue='sex')
<Axes: xlabel='day', ylabel='total_bill'>
sns.barplot(x='day', y ='total_bill',data=tips,hue='sex', palette='winter_r')
<Axes: xlabel='day', ylabel='total_bill'>
sns.barplot(x='day', y ='total_bill',data=tips,hue='smoker')
<Axes: xlabel='day', ylabel='total_bill'>
sns.barplot(x='total_bill', y ='day' , data = tips, palette ='spring')
<Axes: xlabel='total_bill', ylabel='day'>
sns.barplot(x='day', y ='tip' , data = tips, palette='spring',order=['Thur','Fri','Sat','Sun'])
<Axes: xlabel='day', ylabel='tip'>
sns.barplot(x='smoker', y ='tip' , data = tips,hue='sex')
<Axes: xlabel='smoker', ylabel='tip'>
# # Load the dataset
# df = pd.read_csv('mtcars.csv')
iris = pd.read_csv('C:/Users/Acer/Downloads/iris.csv')
iris.head()
| | 150 | 4 | setosa | versicolor | virginica |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
sns.pairplot(iris)
<seaborn.axisgrid.PairGrid at 0x264bb16d760>
sns.pairplot(iris,hue='versicolor')
<seaborn.axisgrid.PairGrid at 0x264b9e68470>
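The odd column names ('150', '4', 'setosa', ...) suggest the file's first line is metadata (row count, feature count, class names) rather than a header, which matches the layout scikit-learn ships iris.csv in. A hedged re-read under that assumption (the column names below are our own choice):
cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
iris = pd.read_csv('C:/Users/Acer/Downloads/iris.csv', skiprows=1, names=cols)
# in this layout the last column encodes the class as 0/1/2
iris['species'] = iris['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
iris.head()
With these names, the commented pairplot below becomes sns.pairplot(iris, hue='species').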
# sns.pairplot(iris,hue='Species')
# "D:\docs\downloads\cars.csv"
# Load the dataset
df6 = pd.read_csv('D:/docs/downloads/cars.csv')
# Viewing first 5 observations
df6.head()
| | id | Manufacturer | Model | Type | Min.Price | Price | Max.Price | MPG.city | MPG.highway | AirBags | ... | Passengers | Length | Wheelbase | Width | Turn.circle | Rear.seat.room | Luggage.room | Weight | Origin | Make |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Acura | Integra | Small | 12.9 | 15.9 | 18.8 | 25 | 31 | NaN | ... | 5 | 177 | 102 | 68 | 37 | 26.5 | 11.0 | 2705 | non-USA | Acura Integra |
| 1 | 2 | Acura | Legend | Midsize | 29.2 | 33.9 | 38.7 | 18 | 25 | Driver & Passenger | ... | 5 | 195 | 115 | 71 | 38 | 30.0 | 15.0 | 3560 | non-USA | Acura Legend |
| 2 | 3 | Audi | 90 | Compact | 25.9 | 29.1 | 32.3 | 20 | 26 | Driver only | ... | 5 | 180 | 102 | 67 | 37 | 28.0 | 14.0 | 3375 | non-USA | Audi 90 |
| 3 | 4 | Audi | 100 | Midsize | 30.8 | 37.7 | 44.6 | 19 | 26 | NaN | ... | 6 | 193 | 106 | 70 | 37 | 31.0 | 17.0 | 3405 | non-USA | Audi 100 |
| 4 | 5 | BMW | 535i | Midsize | 23.7 | 30.0 | 36.2 | 22 | 30 | Driver only | ... | 4 | 186 | 109 | 69 | 39 | 27.0 | 13.0 | 3640 | non-USA | BMW 535i |
5 rows × 28 columns
df6.plot.scatter(x='Weight', y='MPG.highway')
<Axes: xlabel='Weight', ylabel='MPG.highway'>
sns.regplot(x='Weight', y='MPG.highway', data=df6)
<Axes: xlabel='Weight', ylabel='MPG.highway'>
#changing color
sns.regplot(x='Weight', y='MPG.highway',data=df6, color='g')
<Axes: xlabel='Weight', ylabel='MPG.highway'>
sns.lmplot(x='Weight', y='MPG.highway',data=df6)
<seaborn.axisgrid.FacetGrid at 0x264c3fd2c60>
sns.lmplot(x='Weight', y='MPG.highway',hue='Origin', data=df6)
<seaborn.axisgrid.FacetGrid at 0x264c4fca390>
sns.lmplot(x='Weight', y='MPG.highway',hue='Origin',palette='Set1', data=df6)
<seaborn.axisgrid.FacetGrid at 0x264c3fc2ff0>
sns.lmplot(x='Weight', y='MPG.highway',hue='Origin',markers=['+','o'],palette='Set1', data=df6)
<seaborn.axisgrid.FacetGrid at 0x264c5154440>
# df7 = pd.read_csv('')
# df7.columns
Index(['A_id', 'Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness',
'Ripeness', 'Acidity', 'Quality'],
dtype='object')
# df7.head()
| | A_id | Size | Weight | Sweetness | Crunchiness | Juiciness | Ripeness | Acidity | Quality |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -3.970049 | -2.512336 | 5.346330 | -1.012009 | 1.844900 | 0.329840 | -0.491590483 | good |
| 1 | 1.0 | -1.195217 | -2.839257 | 3.664059 | 1.588232 | 0.853286 | 0.867530 | -0.722809367 | good |
| 2 | 2.0 | -0.292024 | -1.351282 | -1.738429 | -0.342616 | 2.838636 | -0.038033 | 2.621636473 | bad |
| 3 | 3.0 | -0.657196 | -2.271627 | 1.324874 | -0.097875 | 3.637970 | -3.413761 | 0.790723217 | good |
| 4 | 4.0 | 1.364217 | -1.296612 | -0.384658 | -0.553006 | 3.030874 | -1.303849 | 0.501984036 | good |
df7.dtypes
A_id           float64
Size           float64
Weight         float64
Sweetness      float64
Crunchiness    float64
Juiciness      float64
Ripeness       float64
Acidity         object
Quality         object
dtype: object
# sns.heatmap(df7.corr())
# sns.heatmap(df.corr(), annot =True)
# plt.figure(figsize=(10,5))
# sns.heatmap(df.corr(), annot =True)
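The heatmap calls above are left commented because df7's path is elided and its Acidity/Quality columns are non-numeric. As a runnable sketch on data already loaded in this notebook, the numeric tips columns work the same way:
plt.figure(figsize=(6, 4))
# corr() needs numeric columns; annot=True writes each coefficient in its cell
sns.heatmap(tips[['total_bill', 'tip', 'size']].corr(), annot=True, cmap='coolwarm')
plt.show()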
1.12 Refer to the link below for the datasets:
https://drive.google.com/drive/folders/1QZxDigr3kJCTjtDBRvbIujSsiR9E3Nak?usp=sharing
1.13 Refer to the link below for "How to choose the right chart":
https://towardsdatascience.com/data-visualization-how-to-choose-the-right-chart-part-1-d4c550085ea7