Create a DataFrame with two columns, Name and Age.
# Sample rows (note the duplicate "Alice" record) and column names.
rows = [
    ("Alice", 25),
    ("Bob", 30),
    ("Alice", 25),
    ("Kate", 22),
]
column_names = ["Name", "Age"]

# Build the DataFrame from the in-memory rows.
df = spark.createDataFrame(rows, column_names)
Approach 1: Using dropDuplicates()
# Approach 1: dropDuplicates() with an explicit column subset.
# Rows that match on both Name and Age are collapsed to one.
deduped = df.dropDuplicates(["Name", "Age"])
deduped.show()
Approach 2: Using distinct()
# Approach 2: distinct() removes rows that are duplicates across ALL columns.
distinct_df = df.distinct()
distinct_df.show()
Approach 3: Using a window function
# Approach 3: number rows within each (Name, Age) group and keep row 1.
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# All rows in a partition share the same Name/Age, so the ordering
# only decides which identical copy survives.
dedup_window = Window.partitionBy("Name", "Age").orderBy("Name")

numbered = df.withColumn("row_number", row_number().over(dedup_window))
deduped_via_window = (
    numbered
    .filter(numbered.row_number == 1)
    .drop("row_number")
)
deduped_via_window.show()
Thank you! Happy learning!