ML¶




Example¶

In [1]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

Load the CSV file¶

In [2]:
dataset = pd.read_csv(r"loan.csv")
# r => raw string -> treats the path text (D:\file\path\loan.csv) literally, with no escape characters

dataset.head(3)
Out[2]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y

Total no. of Rows & Columns¶

In [3]:
dataset.shape
Out[3]:
(614, 13)

No. of Rows => shape[0] , Columns => shape[1]¶

In [4]:
dataset.shape[0]
Out[4]:
614

✅ Find NaN values¶

NaN => null value¶

In [5]:
dataset.isnull()  # NaN => True, otherwise False
Out[5]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 False False False False False False False False True False False False False
1 False False False False False False False False False False False False False
2 False False False False False False False False False False False False False
3 False False False False False False False False False False False False False
4 False False False False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ...
609 False False False False False False False False False False False False False
610 False False False False False False False False False False False False False
611 False False False False False False False False False False False False False
612 False False False False False False False False False False False False False
613 False False False False False False False False False False False False False

614 rows × 13 columns

Count all True (NaN) Values for Every Column¶

In [6]:
dataset.isnull().sum()
Out[6]:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

in % for every column¶

( NaN count of every Column / no. of Rows ) x 100¶

In [7]:
(dataset.isnull().sum()/dataset.shape[0])*100
Out[7]:
Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

Sum of NaN in every Column => dataset.isnull().sum()¶

Sum of NaN of All Column => dataset.isnull().sum().sum()¶

In [8]:
dataset.isnull().sum().sum()
Out[8]:
np.int64(149)

in % Total¶

% of NaN of full dataset¶

In [9]:
(dataset.isnull().sum().sum()/(dataset.shape[0]*dataset.shape[1]))*100

# ( Total NaN / area ) * 100
Out[9]:
np.float64(1.8667000751691305)
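The two percentage formulas above can be checked on a tiny hand-made DataFrame (toy data standing in for the loan file, not the real dataset):

```python
import pandas as pd
import numpy as np

# Toy stand-in for the loan dataset: 4 rows, 2 columns, 3 NaN in total
df = pd.DataFrame({
    "Gender": ["Male", None, "Female", "Male"],
    "LoanAmount": [128.0, np.nan, 66.0, np.nan],
})

# Per-column NaN percentage: (NaN count of the column / no. of rows) * 100
per_col = (df.isnull().sum() / df.shape[0]) * 100
print(per_col["Gender"], per_col["LoanAmount"])   # 25.0 50.0

# Total NaN percentage: (total NaN / total cells) * 100
total = (df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
print(total)                                      # 37.5
```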

✅ Find Not-NaN values¶

In [10]:
dataset.notnull().sum()
Out[10]:
Loan_ID              614
Gender               601
Married              611
Dependents           599
Education            614
Self_Employed        582
ApplicantIncome      614
CoapplicantIncome    614
LoanAmount           592
Loan_Amount_Term     600
Credit_History       564
Property_Area        614
Loan_Status          614
dtype: int64
In [11]:
# Total
dataset.notnull().sum().sum()
Out[11]:
np.int64(7833)

✅ Graph Data¶

In [12]:
sns.heatmap(dataset.isnull())
plt.show()
[heatmap of dataset.isnull() — light cells mark NaN positions]

❌ If ≥ 50% of the Data is NaN => Discard the DataSheet.¶

(dataset.isnull().sum().sum()/(dataset.shape[0]*dataset.shape[1]))*100
# ( Total NaN / Area ) * 100

❌ If ≥ 50% of the Data is missing in a Row or Column => Remove it¶

(dataset.isnull().sum()/dataset.shape[0])*100
# ( NaN count of every Column / no. of Rows ) * 100
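The 50% rule for columns can be applied directly with `.dropna(thresh=...)` — a sketch on toy data, where `thresh` keeps only columns with at least that many non-NaN values:

```python
import pandas as pd
import numpy as np

df = pd.DataFrame({
    "A": [1, 2, 3, 4],                 # 0% missing
    "B": [np.nan, np.nan, np.nan, 4],  # 75% missing -> should be removed
    "C": [1.0, np.nan, 3.0, 4.0],      # 25% missing -> should be kept
})

# Keep only columns that have at least 50% non-NaN values
min_non_nan = int(df.shape[0] * 0.5)   # = 2 here
kept = df.dropna(axis=1, thresh=min_non_nan)
print(kept.columns.tolist())           # ['A', 'C']
```

The same idea with `axis=0` removes rows that are mostly NaN.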

✅ HANDLING MISSING VALUES (DROPPING)¶

dataset.isnull().sum()
- Loan_ID --------------- 0
 - Gender --------------- 13
 - Married --------------- 3
 - Dependents ----------- 15
 - Education ------------- 0
 - Self_Employed -------- 32
 - ApplicantIncome ------- 0
 - CoapplicantIncome ----- 0
 - LoanAmount ----------- 22
 - Loan_Amount_Term ----- 14
 - `Credit_History ------ 50` <= the most NaN (≈ 8% of rows) -> So, as a demonstration, we `Remove` this Column
 - Property_Area --------- 0
 - Loan_Status ----------- 0

☁️ Remove¶

=> dataset.drop(columns=["Credit_History"]) -----------> But it's not Permanent => it only changes the View => dataset.drop(columns=["Credit_History"],inplace=True) --> Permanently changes the variable dataset

☁️ So, U can use new dataSet¶

=> [new-Name] = dataset.drop(columns=["Credit_History"])

In [13]:
dataSet = dataset.drop(columns=["Credit_History"])
dataSet.isnull().sum()
Out[13]:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Property_Area         0
Loan_Status           0
dtype: int64

☁️ Remove all NaN Row => .dropna()¶

In [14]:
dataSet.head(3)
Out[14]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 Urban Y
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 Urban Y
In [15]:
dataSet.dropna(inplace=True)
dataSet.head(3)
Out[15]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Property_Area Loan_Status
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 Urban Y
3 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 Urban Y

0.0 is not a NaN value¶

NaN = null ⬆️¶

In [16]:
sns.heatmap(dataSet.isnull())
plt.show()
[heatmap of dataSet.isnull() — uniform colour, no NaN left]

Black => not NaN (data ✅)¶

White => NaN (data missing ❌)¶

Red => no NaN in the dataSet (no data missing in the complete dataSet ✅✅)¶

In [17]:
dataSet.shape
Out[17]:
(523, 12)
In [18]:
dataSet.isnull().sum()
Out[18]:
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Property_Area        0
Loan_Status          0
dtype: int64
Comparing .shape before & after .dropna():

🦖 old dataset => (614, 13)¶

🦖 New dataSet => (523, 12)¶

☁️ No. of Removed Rows¶

In [19]:
a = 614 - 523                       # no. of Rows removed by dropna()
b = (a / 614) * 100                 # in %

print('Removed_Rows -------------> ' + str(a))
print('Removed_Rows_percentage % -> ' + str(round(b, 2)))

# NB : a and b are numbers, so convert them with str() before
# concatenating with '+' inside print.
Removed_Rows -------------> 91
Removed_Rows_percentage % -> 14.82
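The same calculation can be derived directly from the two shapes instead of typing the counts by hand (614 and 523 are the row counts from .shape above):

```python
# Row counts taken from dataset.shape / dataSet.shape above
old_rows, new_rows = 614, 523

removed = old_rows - new_rows                 # rows dropped by dropna()
removed_pct = (removed / old_rows) * 100

print('Removed_Rows -------------> ' + str(removed))                 # 91
print('Removed_Rows_percentage % -> ' + str(round(removed_pct, 2)))  # 14.82
```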

✅ HANDLING MISSING VALUES¶

        (IMPUTING CATEGORY DATA)¶

In [20]:
dataset.head(3) 
# it's Old dataset
Out[20]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y
In [21]:
dataset.isnull().sum()
Out[21]:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

🐼 Fill All Missing values with 10¶

In [22]:
dataset.fillna(10).head(3)
Out[22]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 10.0 360.0 1.0 Urban Y
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y

But it's not the right way ⬆️ to fill the Data, because it puts the int value (10) into String-Type columns too.¶

=> .head(30) ⬆️¶

Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 10.0 ✅ 360.0 1.0 Urban Y
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y
3 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y
4 LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y
5 LP001011 Male Yes 2 Graduate Yes 5417 4196.0 267.0 360.0 1.0 Urban Y
6 LP001013 Male Yes 0 Not Graduate No 2333 1516.0 95.0 360.0 1.0 Urban Y
7 LP001014 Male Yes 3+ Graduate No 3036 2504.0 158.0 360.0 0.0 Semiurban N
8 LP001018 Male Yes 2 Graduate No 4006 1526.0 168.0 360.0 1.0 Urban Y
9 LP001020 Male Yes 1 Graduate No 12841 10968.0 349.0 360.0 1.0 Semiurban N
10 LP001024 Male Yes 2 Graduate No 3200 700.0 70.0 360.0 1.0 Urban Y
11 LP001027 Male Yes 2 Graduate 10 ❌ 2500 1840.0 109.0 360.0 1.0 Urban Y
12 LP001028 Male Yes 2 Graduate No 3073 8106.0 200.0 360.0 1.0 Urban Y
13 LP001029 Male No 0 Graduate No 1853 2840.0 114.0 360.0 1.0 Rural N
14 LP001030 Male Yes 2 Graduate No 1299 1086.0 17.0 120.0 1.0 Urban Y
15 LP001032 Male No 0 Graduate No 4950 0.0 125.0 360.0 1.0 Urban Y
16 LP001034 Male No 1 Not Graduate No 3596 0.0 100.0 240.0 10.0 Urban Y
17 LP001036 Female No 0 Graduate No 3510 0.0 76.0 360.0 0.0 Urban N
18 LP001038 Male Yes 0 Not Graduate No 4887 0.0 133.0 360.0 1.0 Rural N
19 LP001041 Male Yes 0 Graduate 10 ❌ 2600 3500.0 115.0 10.0 ✅ 1.0 Urban Y
20 LP001043 Male Yes 0 Not Graduate No 7660 0.0 104.0 360.0 0.0 Urban N
21 LP001046 Male Yes 1 Graduate No 5955 5625.0 315.0 360.0 1.0 Urban Y
22 LP001047 Male Yes 0 Not Graduate No 2600 1911.0 116.0 360.0 0.0 Semiurban N
23 LP001050 10 ❌ Yes 2 Not Graduate No 3365 1917.0 112.0 360.0 0.0 Rural N
24 LP001052 Male Yes 1 Graduate 10 ❌ 3717 2925.0 151.0 360.0 10.0 Semiurban N
25 LP001066 Male Yes 0 Graduate Yes 9560 0.0 191.0 360.0 1.0 Semiurban Y
26 LP001068 Male Yes 0 Graduate No 2799 2253.0 122.0 360.0 1.0 Semiurban Y
27 LP001073 Male Yes 2 Not Graduate No 4226 1040.0 110.0 360.0 1.0 Urban Y
28 LP001086 Male No 0 Not Graduate No 1442 0.0 35.0 360.0 1.0 Urban N
29 LP001087 Female No 2 Graduate 10 ❌ 3750 2083.0 120.0 360.0 1.0 Semiurban Y

🐼 ☁️ Backward Filling // ☁️ Forward Filling¶

filter Numerical & Categorical data ⬇️¶

In [23]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB

⬆️⬇️ Up-Down filling¶

dataset.bfill().head(3)  # fillna(method="bfill") is deprecated in newer pandas
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 128.0 ✅ 360.0 1.0 Urban Y
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 ⬆️ 360.0 1.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y
dataset.ffill().head(30)  # fillna(method="ffill") is deprecated in newer pandas
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
18 LP001038 Male Yes 0 Not Graduate No⬇️ 4887 0.0 133.0 360.0 1.0 Rural N
19 LP001041 Male Yes 0 Graduate No✅ 2600 3500.0 115.0 360.0 1.0 Urban Y
20 LP001043 Male Yes 0 Not Graduate No 7660 0.0 104.0 360.0 0.0 Urban N
21 LP001046 Male Yes 1 Graduate No 5955 5625.0 315.0 360.0 1.0 Urban Y
22 LP001047 Male⬇️ Yes 0 Not Graduate No 2600 1911.0 116.0 360.0⬇️ 0.0 Semiurban N
23 LP001050 Male✅ Yes 2 Not Graduate No 3365 1917.0 112.0 360.0✅ 0.0 Rural N

➡️⬅️ side-by-side filling¶

dataset.head(1)
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 NaN ❌ 360.0 1.0 Urban Y
dataset.bfill(axis=1).head(1)
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 360.0✅ 360.0⬅️ 1.0 Urban Y
dataset.ffill(axis=1).head(1)
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0➡️ 0.0✅ 360.0 1.0 Urban Y
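A minimal sketch of .bfill() / .ffill() on a toy Series (in newer pandas these methods replace the deprecated fillna(method=...)):

```python
import pandas as pd
import numpy as np

s = pd.Series([np.nan, 2.0, np.nan, 4.0])

# Backward fill: each NaN takes the next valid value below it
print(s.bfill().tolist())   # [2.0, 2.0, 4.0, 4.0]

# Forward fill: each NaN carries the previous valid value down
# (the first element stays NaN -- there is nothing above it)
print(s.ffill().tolist())
```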

☁️ Mode filling¶

=> The most frequent value in a Column¶

In [24]:
# dataset["Gender"].mode()
dataset["Gender"].mode()[0]
Out[24]:
'Male'

=> ["Gender"].mode() => Male ⬆️¶

fill the Mode-value in the Column ⬇️¶

In [25]:
# dataset["Gender"].fillna(dataset["Gender"].mode()[0],inplace=True) => Permanent Change the dataset, Or
# Or Create a New data_set

data_set = dataset["Gender"].fillna(dataset["Gender"].mode()[0])
data_set
Out[25]:
0        Male
1        Male
2        Male
3        Male
4        Male
        ...  
609    Female
610      Male
611      Male
612      Male
613    Female
Name: Gender, Length: 614, dtype: object
Loan_ID Gender
22 LP001047 Male
23 LP001050 NaN ❌
Loan_ID Gender
22 LP001047 Male
23 LP001050 Male ✅

That was only a single-Column operation ⬆️¶

If U want to fill the Mode-value in All object-Type columns => 🔁 use a Loop¶

In [26]:
# Create a copy of the original dataset
data_set_01 = dataset.copy()
In [27]:
# 1st Collect all .obj (object) Type data from dataset

# data_set_01.select_dtypes(include="object") ------------------> This is the dataset where the obj-Type data present
# data_set_01.select_dtypes(include="object").isnull() ---------> object Type data => True
data_set_01.select_dtypes(include="object").isnull().sum() # ---> Sum of all obj Type data
Out[27]:
Loan_ID           0
Gender           13
Married           3
Dependents       15
Education         0
Self_Employed    32
Property_Area     0
Loan_Status       0
dtype: int64
In [28]:
# find only Column names, as a list

data_set_01.select_dtypes(include="object").columns
Out[28]:
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')
In [29]:
# find only Column names

for i in data_set_01.select_dtypes(include="object").columns:
    print(i)
Loan_ID
Gender
Married
Dependents
Education
Self_Employed
Property_Area
Loan_Status
In [30]:
# fill Mode-value for every Columns

for i in data_set_01.select_dtypes(include="object").columns:
    data_set_01[i] = data_set_01[i].fillna(data_set_01[i].mode()[0])

# NB: assigning back (instead of inplace=True on the column) avoids the
# chained-assignment FutureWarning raised by newer pandas.
In [31]:
data_set_01.select_dtypes(include="object").isnull().sum()
Out[31]:
Loan_ID          0
Gender           0
Married          0
Dependents       0
Education        0
Self_Employed    0
Property_Area    0
Loan_Status      0
dtype: int64

All object Type data fill with Mode (in Every Column)¶

In [32]:
data_set_01.isnull().sum() # for All data Types
Out[32]:
Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
Column NaN
LoanAmount 22
Loan_Amount_Term 14
Credit_History 50

Those are numerical Data Types => so they were not filled by the Object-Type (Mode) loop¶
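Before reaching for scikit-learn, the numeric columns could also be filled in plain pandas, e.g. with the column median (a sketch on toy data, not the loan file):

```python
import pandas as pd
import numpy as np

df = pd.DataFrame({"LoanAmount": [128.0, np.nan, 66.0, 120.0]})

# Median of the non-NaN values [128, 66, 120] is 120.0
df["LoanAmount"] = df["LoanAmount"].fillna(df["LoanAmount"].median())
print(df["LoanAmount"].tolist())   # [128.0, 120.0, 66.0, 120.0]
```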

✅ HANDLING MISSING VALUES¶

        (SCIKIT-LEARN)¶

🐼 1st Organize the dataset & 🐼 filter columns¶

In [33]:
# Create a copy of the original dataset
data_set_02 = dataset.copy()
In [34]:
data_set_02.head(3) # main dataset
Out[34]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y
In [35]:
data_set_02.isnull().sum() # All Missing value
Out[35]:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
In [36]:
data_set_02.info() # All data types
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
In [37]:
# data_set_02.select_dtypes(include="float64") # show only numeric dataType columns
# But there was float + int

numerical_columns = data_set_02.select_dtypes(include=['int64', 'float64'])
numerical_columns
Out[37]:
ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
0 5849 0.0 NaN 360.0 1.0
1 4583 1508.0 128.0 360.0 1.0
2 3000 0.0 66.0 360.0 1.0
3 2583 2358.0 120.0 360.0 1.0
4 6000 0.0 141.0 360.0 1.0
... ... ... ... ... ...
609 2900 0.0 71.0 360.0 1.0
610 4106 0.0 40.0 180.0 1.0
611 8072 240.0 253.0 360.0 1.0
612 7583 0.0 187.0 360.0 1.0
613 4583 0.0 133.0 360.0 0.0

614 rows × 5 columns

In [38]:
numerical_columns.select_dtypes(include=['int64', 'float64']).columns # See only Column Names
Out[38]:
Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

🐼 fill Missing value using Sklearn¶

In [39]:
from sklearn.impute import SimpleImputer

si = SimpleImputer(strategy="mean")
si.fit_transform(data_set_02[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History']])

output:¶

array([[5.84900000e+03, 0.00000000e+00, 1.46412162e+02, 3.60000000e+02,
        1.00000000e+00],
       [4.58300000e+03, 1.50800000e+03, 1.28000000e+02, 3.60000000e+02,
        1.00000000e+00],
       [3.00000000e+03, 0.00000000e+00, 6.60000000e+01, 3.60000000e+02,
        1.00000000e+00],
       ...,
       [8.07200000e+03, 2.40000000e+02, 2.53000000e+02, 3.60000000e+02,
        1.00000000e+00],
       [7.58300000e+03, 0.00000000e+00, 1.87000000e+02, 3.60000000e+02,
        1.00000000e+00],
       [4.58300000e+03, 0.00000000e+00, 1.33000000e+02, 3.60000000e+02,
        0.00000000e+00]])

All the NaN data is filled, but returned as an Array => Convert it into a DataFrame¶

In [40]:
si = SimpleImputer(strategy="mean")
ar = si.fit_transform(numerical_columns[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History']])

Old ❌ NaN¶

In [41]:
# data_set_02.select_dtypes(include="float64").head(3)
numerical_columns.select_dtypes(include=['int64', 'float64']).head(3)
Out[41]:
ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
0 5849 0.0 NaN 360.0 1.0
1 4583 1508.0 128.0 360.0 1.0
2 3000 0.0 66.0 360.0 1.0

New ✅ 146.41... Avg.¶

In [42]:
# pd.DataFrame(ar,columns=[ColumnsName])
data_set_03 = pd.DataFrame(ar,columns=numerical_columns.select_dtypes(include=['int64', 'float64']).columns)
data_set_03.head(3)
Out[42]:
ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
0 5849.0 0.0 146.412162 360.0 1.0
1 4583.0 1508.0 128.000000 360.0 1.0
2 3000.0 0.0 66.000000 360.0 1.0
In [43]:
data_set_03["LoanAmount"].mean() # check mean for one Column (LoanAmount)
Out[43]:
np.float64(146.41216216216216)

🦖 All NaN -> fill with Mean 146.41...⬆️ -> using SCIKIT-LEARN¶

In [44]:
data_set_03.isnull().sum() # All blank numeric daType (NaN) => filled
Out[44]:
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
dtype: int64

🌐🦖 All these steps can be combined in a Pipeline, to Automate the filtering & Deployment ⬆️⬆️¶
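A minimal sketch of that Pipeline idea using ColumnTransformer — the column names here are illustrative, with mean imputation for numeric columns and most_frequent (Mode) for object columns:

```python
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# mean for numeric columns, mode for object columns
preprocess = ColumnTransformer([
    ("num", SimpleImputer(strategy="mean"), ["LoanAmount"]),
    ("cat", SimpleImputer(strategy="most_frequent"), ["Gender"]),
])

demo = pd.DataFrame({
    "LoanAmount": [100.0, np.nan, 200.0],
    "Gender": ["Male", np.nan, "Male"],
})

out = preprocess.fit_transform(demo)
print(out)   # NaN -> 150.0 (mean), NaN -> 'Male' (mode)
```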

✅ ONE HOT ENCODING¶

        (AND DUMMY VARIABLES)¶

Computers can't process & Analyze Categorical data, so we convert it into Numerical data.¶

Categorical data   Numerical (is True)   Numerical (is False)
True               1                     0
True               1                     0
False              0                     1

Categorical data¶

  • True-False
  • Yes-No
  • Male-Female
  • ...

☁️ 1st See the Original dataset & missing value NaN¶

In [45]:
#import pandas as pd
#dataset = pd.read_csv("loan.csv")
dataset.head(3)
Out[45]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y

✈️ find missing (NaN) values (Categorical data)¶

dataset.isnull().sum()
# Or use, => data_set_04.isnull().sum() ---> After using => data_set_04 = dataset.copy()
# give same result

✈️ output:¶

Loan_ID               0

Gender               13 ===> ABC -> 0,1
Married               3 ===> ABC -> 0,1

Dependents           15
Education             0 ===> ABC
Self_Employed        32 ===> ABC
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50 ---> its already in 0, 1
Property_Area         0 ---> ❓
Loan_Status           0 ---> Y-N❓

☁️ fill All Categorical data (Yes - No)¶

In [46]:
# Create a copy of the original dataset
data_set_04 = dataset.copy()
data_set_04.isnull().sum() # all missing columns
Out[46]:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
In [47]:
data_set_04["Gender"] = data_set_04["Gender"].fillna(data_set_04["Gender"].mode()[0]) #----> fill mode in Gender column
data_set_04["Married"] = data_set_04["Married"].fillna(data_set_04["Married"].mode()[0]) #---> fill mode in Married column

# NB: assigning back (instead of inplace=True on the column) avoids the
# chained-assignment FutureWarning raised by newer pandas.
In [48]:
data_set_04.isnull().sum()
Out[48]:
Loan_ID               0
Gender                0
Married               0
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
Loan_ID               0
Gender                0 ==> fill with Mode ✅
Married               0 ==> fill with Mode ✅
Dependents           15
Education             0
Self_Employed        32

...

☁️ ENCODING¶

🦖 using     .get_dummies()¶

In [49]:
# Filter out Gender + Married into a new Variable
data_set_05 = data_set_04[["Gender","Married"]]
data_set_05
Out[49]:
Gender Married
0 Male No
1 Male Yes
2 Male Yes
3 Male Yes
4 Male No
... ... ...
609 Female No
610 Male Yes
611 Male Yes
612 Male Yes
613 Female No

614 rows × 2 columns

In [50]:
pd.get_dummies(data_set_05) # Encoding ---> True False ⬇️
# pd.get_dummies(data_set_05).info() ----> dtype: bool ⬇️
Out[50]:
Gender_Female Gender_Male Married_No Married_Yes
0 False True True False
1 False True False True
2 False True False True
3 False True False True
4 False True True False
... ... ... ... ...
609 True False True False
610 False True False True
611 False True False True
612 False True False True
613 True False True False

614 rows × 4 columns

In [51]:
# Assuming 'dataset' is your DataFrame
encoded_dataset = pd.get_dummies(data_set_05)

# Display the first few rows to show 1s and 0s
print("First few rows with binary indicators:")
print(encoded_dataset.head())
print()

# Display the summary information of the DataFrame
# pd.get_dummies(data_set_05).info()
print("Summary information:")
print(encoded_dataset.info())
First few rows with binary indicators:
   Gender_Female  Gender_Male  Married_No  Married_Yes
0          False         True        True        False
1          False         True       False         True
2          False         True       False         True
3          False         True       False         True
4          False         True        True        False

Summary information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Gender_Female  614 non-null    bool 
 1   Gender_Male    614 non-null    bool 
 2   Married_No     614 non-null    bool 
 3   Married_Yes    614 non-null    bool 
dtypes: bool(4)
memory usage: 2.5 KB
None
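If 1/0 columns are wanted instead of True/False, get_dummies accepts a dtype argument (a small sketch on toy data):

```python
import pandas as pd

df = pd.DataFrame({"Gender": ["Male", "Female", "Male"]})

# dtype=int produces 1/0 instead of the default True/False
encoded = pd.get_dummies(df, dtype=int)
print(encoded["Gender_Male"].tolist())     # [1, 0, 1]
print(encoded["Gender_Female"].tolist())   # [0, 1, 0]
```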

🦖 using     scikit-learn¶

       🌐 scikit-learn download¶


In [52]:
from sklearn.preprocessing import OneHotEncoder
In [53]:
obj = OneHotEncoder()
obj.fit_transform(data_set_05)  # data_set_05 => the Gender + Married frame from above ⬆️

# it creates a sparse matrix, also used in Deep-Learning
Out[53]:
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1228 stored elements and shape (614, 4)>
In [54]:
# Convert into an Array
# array_01 is the transformed array (4 columns)

array_01 = obj.fit_transform(data_set_05).toarray()
array_01
Out[54]:
array([[0., 1., 1., 0.],
       [0., 1., 0., 1.],
       [0., 1., 0., 1.],
       ...,
       [0., 1., 0., 1.],
       [0., 1., 0., 1.],
       [1., 0., 1., 0.]])
In [55]:
# Specify the correct column names (array_01 has 4 columns)
column_names = ["Gender_Female", "Gender_Male", "Married_No", "Married_Yes"]
# Create DataFrame with correct column names
df = pd.DataFrame(array_01, columns=column_names)
# Display the DataFrame
print(df.head())  # Displaying first few rows for example

# ✅✅✅ Or directly u can write => pd.DataFrame(array_01,columns=["Gender_Female", "Gender_Male", "Married_No", "Married_Yes"])
   Gender_Female  Gender_Male  Married_No  Married_Yes
0            0.0          1.0         1.0          0.0
1            0.0          1.0         0.0          1.0
2            0.0          1.0         0.0          1.0
3            0.0          1.0         0.0          1.0
4            0.0          1.0         1.0          0.0

⬆️ These columns are redundant (each pair always sums to 1) => Remove one column per feature¶

In [56]:
obj_01 = OneHotEncoder(drop="first")
c = obj_01.fit_transform(data_set_05).toarray()
c
Out[56]:
array([[1., 0.],
       [1., 1.],
       [1., 1.],
       ...,
       [1., 1.],
       [1., 1.],
       [0., 0.]])
In [57]:
pd.DataFrame(c, columns= ["Gender_Male", "Married_Yes"])
Out[57]:
Gender_Male Married_Yes
0 1.0 0.0
1 1.0 1.0
2 1.0 1.0
3 1.0 1.0
4 1.0 0.0
... ... ...
609 0.0 0.0
610 1.0 1.0
611 1.0 1.0
612 1.0 1.0
613 0.0 0.0

614 rows × 2 columns

✅ LABEL ENCODING¶



MACHINE LEARNING
                                       |
                    -------------------------------------------------
                    |                                               |
         SUPERVISED LEARNING                            UN-SUPERVISED LEARNING
                |                                               |
        ---------------------                           ---------------------
        |                   |                           |                   |
`CLASSIFICATION`       `REGRESSION`               `CLUSTERING`        `ASSOCIATION`



|------- `NOMINAL`  ---->  Cow, Dog, ox, parrot
                                |
        CATEGORICAL DATA -------|
                                |
                                |------- `ORDINAL`  -----> XL, XXL, XXXL


Used for unordered (Nominal) data ⬇️ gives every Name => a unique no.

In [58]:
#import pandas as pd
dF = pd.DataFrame({"name":["Dog", "Cat", "Cow", "Lion", "Ox"]})
dF
Out[58]:
name
0 Dog
1 Cat
2 Cow
3 Lion
4 Ox
In [59]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder() 
le.fit_transform(dF["name"]) # fit => learn the classes (train the encoder)
                             # transform => convert them to numbers
output:¶
array([2, 0, 1, 3, 4])
In [63]:
le = LabelEncoder()
dF["en_name"] = le.fit_transform(dF["name"])
dF
Out[63]:
name en_name
0 Dog 2
1 Cat 0
2 Cow 1
3 Lion 3
4 Ox 4
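LabelEncoder assigns codes in alphabetical order of the classes, and inverse_transform maps codes back to names — a quick sketch:

```python
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
codes = le.fit_transform(["Dog", "Cat", "Cow", "Lion", "Ox"])

print(codes.tolist())                         # [2, 0, 1, 3, 4]
print(le.classes_.tolist())                   # ['Cat', 'Cow', 'Dog', 'Lion', 'Ox']
print(le.inverse_transform([2, 0]).tolist())  # ['Dog', 'Cat']
```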
In [ ]: