import pandas as pd
DataFrame Operations — Filtering, Deduplication, and Unique Values
This notebook demonstrates essential DataFrame filtering and cleaning techniques using an employees dataset. Key operations include: parsing dates during import with parse_dates, converting columns to appropriate types (bool, category, datetime), boolean filtering with equality and membership tests (.isin()), detecting and removing duplicates with .duplicated() and .drop_duplicates(), and examining column cardinality with .unique() and .nunique(). These techniques are fundamental for data quality assurance and preparing clean datasets for analysis.
# Load the employees dataset, asking read_csv to parse the two date/time
# columns while the file is being read.
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])

# Give the remaining columns more suitable dtypes.
# NOTE(review): astype("bool") turns missing values (NaN) into True; confirm
# that "Senior Management" has no gaps or that this outcome is intended.
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")

# Notebook-style previews: the first rows, then the parsed date column.
df.head()
df["Start Date"]

# Explicit datetime conversion. This changes nothing when parse_dates above
# succeeded, but read_csv can hand a column back as plain strings when it
# cannot parse it, so converting here guarantees the datetime dtype (or
# raises instead of silently keeping strings).
df["Start Date"] = pd.to_datetime(df["Start Date"])
df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])

# Placeholder left over from the notebook export. It is assigned once, under
# its correctly spelled name (the export also had a misspelled "outpput").
output = None

# Display the fully converted frame.
df
# Rows whose Gender equals "Male" (boolean mask built from an equality test).
df[df["Gender"] == "Male"]

# Select the Team column. Column labels are case-sensitive: the column is
# spelled "Team" (as in the membership filter just below), so the lowercase
# "team" would raise KeyError.
df["Team"]

# Rows whose Team is either Marketing or Legal (membership test via isin).
df[df["Team"].isin(["Marketing", "Legal"])]
# Order the rows by first name, modifying df itself rather than a copy.
df.sort_values(by="First Name", inplace=True)

# keep=False marks every member of a repeated-name group as a duplicate;
# inverting that leaves True only for names that occur exactly once.
first_names = df["First Name"]
mask = ~first_names.duplicated(keep=False)
df.loc[mask]

# Count the rows that survive when every repeated first name is dropped.
df.drop_duplicates(subset=["First Name"], keep=False).shape[0]
# Distinct values present in the Gender column.
df["Gender"].unique()

# Distinct Team values, followed by how many there are. dropna=False makes a
# missing value count as one of the distinct entries.
team_column = df["Team"]
team_column.unique()
team_column.nunique(dropna=False)