Skip to content

16 mostly used Pandas functions

These Pandas functions are often used by the data scientists to understand the data and perform other operations.

1. pd.read_csv() – read a csv file

2. df.head() – display top 5 rows by default

3. df.describe() – get descriptive statistics of numeric columns

4. df.apply() – apply a function to column

5. df.groupby() – group the aggregated result by specific column

6. df.rolling() – calculate moving average given a window size

7. df.sort_values() – sort tables by a specific column

8. df.plot() – for visualizing data

9. df.corr() – computes the pairwise correlation of columns, excluding NaN

10. df.sample() – randomly samples a specified number of rows from the

11. pd.to_datetime() – converts a specified column or series to datetime format.

12. df.rename() – Rename columns

13. df.filter() – to subset the rows or columns of a DataFrame based on specific criteria

14. df.drop() – to remove rows or columns from a DataFrame

15. df.fill() – to fill missing values

16. df.rank() – to rank the values

# data source - https://catalog.data.gov/dataset/surface-water-drinking-water-source-areas
#-------------------------------------------------------------------------
# 1. pd.read_csv() - read a csv file
df = pd.read_csv('/content/Surface_Water_Drinking_Water_Source_Areas.csv')
#-------------------------------------------------------------------------
# 2. df.head() - display top 5 rows by default
df.head()
#-------------------------------------------------------------------------
# 3. df.describe() - get descriptive statistics of numeric columns
df.describe()
#-------------------------------------------------------------------------
# 4. df.apply() - apply a function to column
# Define the function to convert acres to square meters
def acres_to_sq_meters(acres):
    return acres * 4046.86

# Apply the function to the 'ACRES' column
df['SQ_METERS'] = df['ACRES'].apply(acres_to_sq_meters)
#-------------------------------------------------------------------------
# 5. df.groupby() - group the aggregated result by specific column
# Group by 'PWS_Name' and sum the 'SQ_METERS' column
grouped_df = df.groupby('PWS_Name')['SQ_METERS'].sum()
#-------------------------------------------------------------------------
# 6. df.rolling() - calculate moving average given a window size
# Basic Syntax: df.rolling(window, min_periods=None, center=False, axis=0).function()
# Calculate a 3-row rolling mean for 'SQ_METERS'
df['Rolling_Mean'] = df['SQ_METERS'].rolling(window=3).mean()
#-------------------------------------------------------------------------
# 7. df.sort_values() - sort tables by a specific column
# Basic Syntax: df.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
# Sort by the 'SQ_METERS' column in ascending order
sorted_df = df.sort_values(by='SQ_METERS')
#-------------------------------------------------------------------------
# 8. df.plot() - for visualizing data
# Basic Syntax:df.plot(kind='line', x=None, y=None, ax=None, subplots=False, sharex=None, sharey=False, figsize=None, title=None)
# Sorting the DataFrame by 'ACRES' in descending order and selecting the top 5
top_5_acres = df.sort_values(by='ACRES', ascending=False).head(5)

# Bar plot of 'ACRES' for the top 5 'PWS_Name'
top_5_acres.plot(kind='bar', x='PWS_Name', y='ACRES', title='Top 5 PWS Names by ACRES', legend=False)
#-------------------------------------------------------------------------
# 9. df.corr() - computes the pairwise correlation of columns, excluding NaN values.
# Calculate the correlation matrix
correlation_matrix = df.select_dtypes(include=['number']).corr()
#-------------------------------------------------------------------------
# 10. df.sample() - randomly samples a specified number of rows from the DataFrame.
# Get a random sample of 5 rows
sample_data = df.sample(n=5)
#-------------------------------------------------------------------------
# 11. pd.to_datetime() - converts a specified column or series to datetime format.
# Convert the 'DATE_DEL' column to datetime format
df['DATE_DEL'] = pd.to_datetime(df['DATE_DEL'], errors='coerce')
#-------------------------------------------------------------------------
# 12. df.rename() - Rename columns
df.rename(columns={
    'OBJECTID_1': 'ID',
    'PWS_Name': 'Public_Water_System_Name',
    'ACRES': 'Acres_Area',
    'DATE_DEL': 'Date_Delivered'
}, inplace=True)
#-------------------------------------------------------------------------
# 13. df.filter() - to subset the rows or columns of a DataFrame based on specific criteria
# Filter specific columns
filtered_columns = df.filter(items=['PWS_Name', 'ACRES'])

# Filter rows where PWS_Name contains 'WATER'
filtered_rows = df.filter(regex='WATER', axis=0)
#-------------------------------------------------------------------------
# 14. df.drop() - to remove rows or columns from a DataFrame
# Drop a row by index
df_dropped_row = df.drop(index=1)

# Drop a column by name
df_dropped_column = df.drop(columns='DATE_DEL')
#-------------------------------------------------------------------------
# 15. df.fill() - to fill missing values
# Fill missing values with specific values
df_filled = df.fillna({
    'ACRES': 0,
    'DATE_DEL': 'Unknown'
})

# Forward fill missing values
df_ffill = df.fillna(method='ffill')

# Backward fill missing values
df_bfill = df.fillna(method='bfill')
#-------------------------------------------------------------------------
# 16. df.rank() - to rank the values
# Rank the values in the 'ACRES' column
df['ACRES_Rank'] = df['ACRES'].rank()
#-------------------------------------------------------------------------

Leave a Reply

Your email address will not be published. Required fields are marked *