From 18b4c52c96d8d74a79816fb96e0b7789f1b3ba11 Mon Sep 17 00:00:00 2001 From: Krishna Kaushik <131583096+kRiShNa-429407@users.noreply.github.com> Date: Sun, 26 May 2024 02:23:42 +0530 Subject: [PATCH 1/7] Add files via upload --- contrib/pandas/Datasets/car-sales-missing-data.csv | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 contrib/pandas/Datasets/car-sales-missing-data.csv diff --git a/contrib/pandas/Datasets/car-sales-missing-data.csv b/contrib/pandas/Datasets/car-sales-missing-data.csv new file mode 100644 index 0000000..e34cd5f --- /dev/null +++ b/contrib/pandas/Datasets/car-sales-missing-data.csv @@ -0,0 +1,11 @@ +Make,Colour,Odometer,Doors,Price +Toyota,White,150043,4,"$4,000" +Honda,Red,87899,4,"$5,000" +Toyota,Blue,,3,"$7,000" +BMW,Black,11179,5,"$22,000" +Nissan,White,213095,4,"$3,500" +Toyota,Green,,4,"$4,500" +Honda,,,4,"$7,500" +Honda,Blue,,4, +Toyota,White,60000,, +,White,31600,4,"$9,700" \ No newline at end of file From 8dc01e263549680c0cda00b53d49f9d33133a603 Mon Sep 17 00:00:00 2001 From: Krishna Kaushik <131583096+kRiShNa-429407@users.noreply.github.com> Date: Sun, 26 May 2024 21:48:11 +0530 Subject: [PATCH 2/7] Add files via upload --- contrib/pandas/Handling_Missing_Values.md | 273 ++++++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100644 contrib/pandas/Handling_Missing_Values.md diff --git a/contrib/pandas/Handling_Missing_Values.md b/contrib/pandas/Handling_Missing_Values.md new file mode 100644 index 0000000..7207d81 --- /dev/null +++ b/contrib/pandas/Handling_Missing_Values.md @@ -0,0 +1,273 @@ +# Handling Missing Values in Pandas + +**Upuntil now we're working on complete data i.e not having any missing values. But in real life it is the one of the main problem.** + +*Many datasets arrive with missing data either because it exists and was not collected or it never existed.* + +In Pandas missing data is represented by two values: + +* `None` : None is simply is `keyword` refer as empty or none. 
+* `NaN` : Acronym for `Not a Number`. + +**There are several useful functions for detecting, removing, and replacing null values in Pandas DataFrame :** + +1. isnull() +2. notnull() +3. dropna() +4. fillna() +5. replace() + +## 2. Checking for missing values using `isnull()` and `notnull()` + +Let's import pandas and our fancy car-sales dataset having some missing values. + + +```python +import pandas as pd +``` + + +```python +car_sales_missing_df = pd.read_csv("https://raw.githubusercontent.com/kRiShNa-429407/learn-python/main/contrib/pandas/Datasets/car-sales-missing-data.csv") +print(car_sales_missing_df) +``` + + Make Colour Odometer Doors Price + 0 Toyota White 150043.0 4.0 $4,000 + 1 Honda Red 87899.0 4.0 $5,000 + 2 Toyota Blue NaN 3.0 $7,000 + 3 BMW Black 11179.0 5.0 $22,000 + 4 Nissan White 213095.0 4.0 $3,500 + 5 Toyota Green NaN 4.0 $4,500 + 6 Honda NaN NaN 4.0 $7,500 + 7 Honda Blue NaN 4.0 NaN + 8 Toyota White 60000.0 NaN NaN + 9 NaN White 31600.0 4.0 $9,700 + + + +```python +## Using isnull() + +print(car_sales_missing_df.isnull()) +``` + + Make Colour Odometer Doors Price + 0 False False False False False + 1 False False False False False + 2 False False True False False + 3 False False False False False + 4 False False False False False + 5 False False True False False + 6 False True True False False + 7 False False True False True + 8 False False False True True + 9 True False False False False + + +Note here: +* `True` marks a `NaN` (missing) value +* `False` marks a value that is not `NaN` + +If we want to find the number of missing values in each column use `isnull().sum()`. + + +```python +print(car_sales_missing_df.isnull().sum()) +``` + + Make 1 + Colour 1 + Odometer 4 + Doors 1 + Price 2 + dtype: int64 + + +You can also check for the presence of null values in a single column. 
+ + +```python +print(car_sales_missing_df["Odometer"].isnull()) +``` + + 0 False + 1 False + 2 True + 3 False + 4 False + 5 True + 6 True + 7 True + 8 False + 9 False + Name: Odometer, dtype: bool + + + +```python +## using notnull() + +print(car_sales_missing_df.notnull()) +``` + + Make Colour Odometer Doors Price + 0 True True True True True + 1 True True True True True + 2 True True False True True + 3 True True True True True + 4 True True True True True + 5 True True False True True + 6 True False False True True + 7 True True False True False + 8 True True True False False + 9 False True True True True + + +Note here: +* `True` means no `NaN` values +* `False` means for `NaN` values + +#### A little note here : `isnull()` means having null values so it gives boolean `True` for NaN values. And `notnull()` means having no null values so it gives `True` for no NaN value. + +## 2. Filling missing values using `fillna()`, `replace()`. + + +```python +## Filling missing values with a single value using `fillna` +print(car_sales_missing_df.fillna(0)) +``` + + Make Colour Odometer Doors Price + 0 Toyota White 150043.0 4.0 $4,000 + 1 Honda Red 87899.0 4.0 $5,000 + 2 Toyota Blue 0.0 3.0 $7,000 + 3 BMW Black 11179.0 5.0 $22,000 + 4 Nissan White 213095.0 4.0 $3,500 + 5 Toyota Green 0.0 4.0 $4,500 + 6 Honda 0 0.0 4.0 $7,500 + 7 Honda Blue 0.0 4.0 0 + 8 Toyota White 60000.0 0.0 0 + 9 0 White 31600.0 4.0 $9,700 + + + +```python +## Filling missing values with the previous value using `ffill()` +print(car_sales_missing_df.ffill()) +``` + + Make Colour Odometer Doors Price + 0 Toyota White 150043.0 4.0 $4,000 + 1 Honda Red 87899.0 4.0 $5,000 + 2 Toyota Blue 87899.0 3.0 $7,000 + 3 BMW Black 11179.0 5.0 $22,000 + 4 Nissan White 213095.0 4.0 $3,500 + 5 Toyota Green 213095.0 4.0 $4,500 + 6 Honda Green 213095.0 4.0 $7,500 + 7 Honda Blue 213095.0 4.0 $7,500 + 8 Toyota White 60000.0 4.0 $7,500 + 9 Toyota White 31600.0 4.0 $9,700 + + + +```python +## Filling null values with the next 
ones using 'bfill()' +print(car_sales_missing_df.bfill()) +``` + + Make Colour Odometer Doors Price + 0 Toyota White 150043.0 4.0 $4,000 + 1 Honda Red 87899.0 4.0 $5,000 + 2 Toyota Blue 11179.0 3.0 $7,000 + 3 BMW Black 11179.0 5.0 $22,000 + 4 Nissan White 213095.0 4.0 $3,500 + 5 Toyota Green 60000.0 4.0 $4,500 + 6 Honda Blue 60000.0 4.0 $7,500 + 7 Honda Blue 60000.0 4.0 $9,700 + 8 Toyota White 60000.0 4.0 $9,700 + 9 NaN White 31600.0 4.0 $9,700 + + +#### Filling a null values using `replace()` method + +**Now we are going to replace the all Nan value in the data frame with -125 value** + +*For this we will need numpy also* + + +```python +import numpy as np +``` + + +```python +print(car_sales_missing_df.replace(to_replace = np.nan, value = -125) ) +``` + + Make Colour Odometer Doors Price + 0 Toyota White 150043.0 4.0 $4,000 + 1 Honda Red 87899.0 4.0 $5,000 + 2 Toyota Blue -125.0 3.0 $7,000 + 3 BMW Black 11179.0 5.0 $22,000 + 4 Nissan White 213095.0 4.0 $3,500 + 5 Toyota Green -125.0 4.0 $4,500 + 6 Honda -125 -125.0 4.0 $7,500 + 7 Honda Blue -125.0 4.0 -125 + 8 Toyota White 60000.0 -125.0 -125 + 9 -125 White 31600.0 4.0 $9,700 + + +## 3. Dropping missing values using `dropna()` + +**In order to drop a null values from a dataframe, we used `dropna()` function this function drop Rows/Columns of datasets with Null values in different ways.** + +#### Dropping rows with at least 1 null value. + + +```python +print(car_sales_missing_df.dropna(axis = 0)) ##Now we drop rows with at least one Nan value (Null value) +``` + + Make Colour Odometer Doors Price + 0 Toyota White 150043.0 4.0 $4,000 + 1 Honda Red 87899.0 4.0 $5,000 + 3 BMW Black 11179.0 5.0 $22,000 + 4 Nissan White 213095.0 4.0 $3,500 + + +#### Dropping rows if all values in that row are missing. 
+ + +```python +print(car_sales_missing_df.dropna(how = 'all',axis = 0)) ## Rows that still have at least one non-null value are kept as they are +``` + + Make Colour Odometer Doors Price + 0 Toyota White 150043.0 4.0 $4,000 + 1 Honda Red 87899.0 4.0 $5,000 + 2 Toyota Blue NaN 3.0 $7,000 + 3 BMW Black 11179.0 5.0 $22,000 + 4 Nissan White 213095.0 4.0 $3,500 + 5 Toyota Green NaN 4.0 $4,500 + 6 Honda NaN NaN 4.0 $7,500 + 7 Honda Blue NaN 4.0 NaN + 8 Toyota White 60000.0 NaN NaN + 9 NaN White 31600.0 4.0 $9,700 + + +#### Dropping columns with at least 1 null value + + +```python +print(car_sales_missing_df.dropna(axis = 1)) +``` + + Empty DataFrame + Columns: [] + Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + + +Now we drop a columns which have at least 1 missing values. + +**Here the dataset becomes empty after dropna() because each column as atleast 1 null value so it remove that columns resulting in an empty dataframe.** From eeef1dc21c19e1275c92fbbb6923b81d0a0635d0 Mon Sep 17 00:00:00 2001 From: Krishna Kaushik <131583096+kRiShNa-429407@users.noreply.github.com> Date: Sun, 26 May 2024 21:50:20 +0530 Subject: [PATCH 3/7] Update index.md --- contrib/pandas/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/pandas/index.md b/contrib/pandas/index.md index bf677cf..894be52 100644 --- a/contrib/pandas/index.md +++ b/contrib/pandas/index.md @@ -6,3 +6,4 @@ - [Group By Functions with Pandas](GroupBy_Functions_Pandas.md) - [Excel using Pandas DataFrame](excel_with_pandas.md) - [Importing and Exporting Data in Pandas](import-export.md) +- [Handling Missing Values in Pandas](Handling_Missing_Values.md) From be7781b1e35a22cfcbc853bf8cb2c31304308e15 Mon Sep 17 00:00:00 2001 From: Ankit Mahato Date: Mon, 27 May 2024 08:16:52 +0530 Subject: [PATCH 4/7] Rename Handling_Missing_Values.md to handling-missing-values.md --- .../{Handling_Missing_Values.md => handling-missing-values.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename contrib/pandas/{Handling_Missing_Values.md => 
handling-missing-values.md} (100%) diff --git a/contrib/pandas/Handling_Missing_Values.md b/contrib/pandas/handling-missing-values.md similarity index 100% rename from contrib/pandas/Handling_Missing_Values.md rename to contrib/pandas/handling-missing-values.md From ad6fd6b6233140c0e128cdd424409247921d1b6f Mon Sep 17 00:00:00 2001 From: Ankit Mahato Date: Mon, 27 May 2024 08:17:16 +0530 Subject: [PATCH 5/7] Update index.md --- contrib/pandas/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/pandas/index.md b/contrib/pandas/index.md index 894be52..c71c324 100644 --- a/contrib/pandas/index.md +++ b/contrib/pandas/index.md @@ -6,4 +6,4 @@ - [Group By Functions with Pandas](GroupBy_Functions_Pandas.md) - [Excel using Pandas DataFrame](excel_with_pandas.md) - [Importing and Exporting Data in Pandas](import-export.md) -- [Handling Missing Values in Pandas](Handling_Missing_Values.md) +- [Handling Missing Values in Pandas](handling-missing-values.md) From 5d15c73a87290afdf5cf8928f92e4f315b6e4c74 Mon Sep 17 00:00:00 2001 From: Ankit Mahato Date: Mon, 27 May 2024 08:17:31 +0530 Subject: [PATCH 6/7] Update car-sales-missing-data.csv --- contrib/pandas/Datasets/car-sales-missing-data.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/pandas/Datasets/car-sales-missing-data.csv b/contrib/pandas/Datasets/car-sales-missing-data.csv index e34cd5f..21a3157 100644 --- a/contrib/pandas/Datasets/car-sales-missing-data.csv +++ b/contrib/pandas/Datasets/car-sales-missing-data.csv @@ -8,4 +8,4 @@ Toyota,Green,,4,"$4,500" Honda,,,4,"$7,500" Honda,Blue,,4, Toyota,White,60000,, -,White,31600,4,"$9,700" \ No newline at end of file +,White,31600,4,"$9,700" From 8c95bb1de779e455c53c52bd22622b8a533d63c4 Mon Sep 17 00:00:00 2001 From: Ankit Mahato Date: Mon, 27 May 2024 08:21:10 +0530 Subject: [PATCH 7/7] Update handling-missing-values.md --- contrib/pandas/handling-missing-values.md | 37 +++++++++-------------- 1 file changed, 14 
insertions(+), 23 deletions(-) diff --git a/contrib/pandas/handling-missing-values.md b/contrib/pandas/handling-missing-values.md index 7207d81..da6c377 100644 --- a/contrib/pandas/handling-missing-values.md +++ b/contrib/pandas/handling-missing-values.md @@ -1,34 +1,28 @@ # Handling Missing Values in Pandas -**Upuntil now we're working on complete data i.e not having any missing values. But in real life it is the one of the main problem.** - -*Many datasets arrive with missing data either because it exists and was not collected or it never existed.* +In real life, many datasets arrive with missing data either because it exists and was not collected or it never existed. In Pandas missing data is represented by two values: * `None` : None is simply is `keyword` refer as empty or none. * `NaN` : Acronym for `Not a Number`. -**There are several useful functions for detecting, removing, and replacing null values in Pandas DataFrame :** +There are several useful functions for detecting, removing, and replacing null values in Pandas DataFrame: -1. isnull() -2. notnull() -3. dropna() -4. fillna() -5. replace() +1. `isnull()` +2. `notnull()` +3. `dropna()` +4. `fillna()` +5. `replace()` ## 2. Checking for missing values using `isnull()` and `notnull()` Let's import pandas and our fancy car-sales dataset having some missing values. - ```python import pandas as pd -``` - -```python -car_sales_missing_df = pd.read_csv("https://raw.githubusercontent.com/kRiShNa-429407/learn-python/main/contrib/pandas/Datasets/car-sales-missing-data.csv") +car_sales_missing_df = pd.read_csv("Datasets/car-sales-missing-data.csv") print(car_sales_missing_df) ``` @@ -128,7 +122,7 @@ Note here: * `True` means no `NaN` values * `False` means for `NaN` values -#### A little note here : `isnull()` means having null values so it gives boolean `True` for NaN values. And `notnull()` means having no null values so it gives `True` for no NaN value. 
+`isnull()` flags null values, so it returns `True` wherever a value is `NaN`. `notnull()` is its inverse, so it returns `True` wherever a value is not `NaN`. ## 2. Filling missing values using `fillna()`, `replace()`. @@ -191,18 +185,15 @@ print(car_sales_missing_df.bfill()) #### Filling a null values using `replace()` method -**Now we are going to replace the all Nan value in the data frame with -125 value** +Now we are going to replace all the `NaN` values in the data frame with -125. -*For this we will need numpy also* +For this we will also need numpy. ```python import numpy as np -``` - -```python -print(car_sales_missing_df.replace(to_replace = np.nan, value = -125) ) +print(car_sales_missing_df.replace(to_replace = np.nan, value = -125)) ``` Make Colour Odometer Doors Price @@ -220,7 +211,7 @@ print(car_sales_missing_df.replace(to_replace = np.nan, value = -125) ) ## 3. Dropping missing values using `dropna()` -**In order to drop a null values from a dataframe, we used `dropna()` function this function drop Rows/Columns of datasets with Null values in different ways.** +To drop null values from a dataframe, we use the `dropna()` function. It can drop rows or columns containing null values in different ways. #### Dropping rows with at least 1 null value. @@ -270,4 +261,4 @@ print(car_sales_missing_df.dropna(axis = 1)) Now we drop a columns which have at least 1 missing values. -**Here the dataset becomes empty after dropna() because each column as atleast 1 null value so it remove that columns resulting in an empty dataframe.** +Here the dataset becomes empty after `dropna()` because each column has at least 1 null value, so every column is removed, resulting in an empty dataframe.