These Pandas functions are often used by the data scientists to understand the data and perform other operations.
1. pd.read_csv() – read a csv file
2. df.head() – display top 5 rows by default
3. df.describe() – get descriptive statistics of numeric columns
4. df.apply() – apply a function to column
5. df.groupby() – group the aggregated result by specific column
6. df.rolling() – calculate moving average given a window size
7. df.sort_values() – sort tables by a specific column
8. df.plot() – for visualizing data
9. df.corr() – computes the pairwise correlation of columns, excluding NaN
10. df.sample() – randomly samples a specified number of rows from the
11. pd.to_datetime() – converts a specified column or series to datetime format.
12. df.rename() – Rename columns
13. df.filter() – to subset the rows or columns of a DataFrame based on specific criteria
14. df.drop() – to remove rows or columns from a DataFrame
15. df.fill() – to fill missing values
16. df.rank() – to rank the values
# data source - https://catalog.data.gov/dataset/surface-water-drinking-water-source-areas
#-------------------------------------------------------------------------
# 1. pd.read_csv() - read a csv file
df = pd.read_csv('/content/Surface_Water_Drinking_Water_Source_Areas.csv')
#-------------------------------------------------------------------------
# 2. df.head() - display top 5 rows by default
df.head()
#-------------------------------------------------------------------------
# 3. df.describe() - get descriptive statistics of numeric columns
df.describe()
#-------------------------------------------------------------------------
# 4. df.apply() - apply a function to column
# Define the function to convert acres to square meters
def acres_to_sq_meters(acres):
return acres * 4046.86
# Apply the function to the 'ACRES' column
df['SQ_METERS'] = df['ACRES'].apply(acres_to_sq_meters)
#-------------------------------------------------------------------------
# 5. df.groupby() - group the aggregated result by specific column
# Group by 'PWS_Name' and sum the 'SQ_METERS' column
grouped_df = df.groupby('PWS_Name')['SQ_METERS'].sum()
#-------------------------------------------------------------------------
# 6. df.rolling() - calculate moving average given a window size
# Basic Syntax: df.rolling(window, min_periods=None, center=False, axis=0).function()
# Calculate a 3-row rolling mean for 'SQ_METERS'
df['Rolling_Mean'] = df['SQ_METERS'].rolling(window=3).mean()
#-------------------------------------------------------------------------
# 7. df.sort_values() - sort tables by a specific column
# Basic Syntax: df.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
# Sort by the 'SQ_METERS' column in ascending order
sorted_df = df.sort_values(by='SQ_METERS')
#-------------------------------------------------------------------------
# 8. df.plot() - for visualizing data
# Basic Syntax:df.plot(kind='line', x=None, y=None, ax=None, subplots=False, sharex=None, sharey=False, figsize=None, title=None)
# Sorting the DataFrame by 'ACRES' in descending order and selecting the top 5
top_5_acres = df.sort_values(by='ACRES', ascending=False).head(5)
# Bar plot of 'ACRES' for the top 5 'PWS_Name'
top_5_acres.plot(kind='bar', x='PWS_Name', y='ACRES', title='Top 5 PWS Names by ACRES', legend=False)
#-------------------------------------------------------------------------
# 9. df.corr() - computes the pairwise correlation of columns, excluding NaN values.
# Calculate the correlation matrix
correlation_matrix = df.select_dtypes(include=['number']).corr()
#-------------------------------------------------------------------------
# 10. df.sample() - randomly samples a specified number of rows from the DataFrame.
# Get a random sample of 5 rows
sample_data = df.sample(n=5)
#-------------------------------------------------------------------------
# 11. pd.to_datetime() - converts a specified column or series to datetime format.
# Convert the 'DATE_DEL' column to datetime format
df['DATE_DEL'] = pd.to_datetime(df['DATE_DEL'], errors='coerce')
#-------------------------------------------------------------------------
# 12. df.rename() - Rename columns
df.rename(columns={
'OBJECTID_1': 'ID',
'PWS_Name': 'Public_Water_System_Name',
'ACRES': 'Acres_Area',
'DATE_DEL': 'Date_Delivered'
}, inplace=True)
#-------------------------------------------------------------------------
# 13. df.filter() - to subset the rows or columns of a DataFrame based on specific criteria
# Filter specific columns
filtered_columns = df.filter(items=['PWS_Name', 'ACRES'])
# Filter rows where PWS_Name contains 'WATER'
filtered_rows = df.filter(regex='WATER', axis=0)
#-------------------------------------------------------------------------
# 14. df.drop() - to remove rows or columns from a DataFrame
# Drop a row by index
df_dropped_row = df.drop(index=1)
# Drop a column by name
df_dropped_column = df.drop(columns='DATE_DEL')
#-------------------------------------------------------------------------
# 15. df.fill() - to fill missing values
# Fill missing values with specific values
df_filled = df.fillna({
'ACRES': 0,
'DATE_DEL': 'Unknown'
})
# Forward fill missing values
df_ffill = df.fillna(method='ffill')
# Backward fill missing values
df_bfill = df.fillna(method='bfill')
#-------------------------------------------------------------------------
# 16. df.rank() - to rank the values
# Rank the values in the 'ACRES' column
df['ACRES_Rank'] = df['ACRES'].rank()
#-------------------------------------------------------------------------