Pandas is a powerful library for data manipulation and analysis. It provides data structures like DataFrame and Series, which allow for easy handling of structured data, including data from CSV files, databases, and more.
```python
import pandas as pd

# Create a Series
series = pd.Series([10, 20, 30], index=['a', 'b', 'c'])

# Create a DataFrame
data = {'Name': ['Alice', 'Bob'], 'Age': [25, 30]}
df = pd.DataFrame(data)

print("Series:\n", series)
print("DataFrame:\n", df)
```
Output:

```
Series:
a    10
b    20
c    30
dtype: int64
DataFrame:
    Name  Age
0  Alice   25
1    Bob   30
```
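The labelled index is what makes a Series more than a plain array: elements can be pulled out by label as well as by position. A minimal sketch, reusing the `series` defined above:

```python
# Access by label
print(series['a'])      # 10
print(series.loc['a'])  # 10, explicit label-based lookup

# Access by integer position
print(series.iloc[0])   # 10, explicit position-based lookup
```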
```python
# Read from a CSV file
df = pd.read_csv('data.csv')

# Export to a CSV file
df.to_csv('output.csv', index=False)

# Read from and write to Excel (.xlsx files require the openpyxl package)
df = pd.read_excel('data.xlsx')
df.to_excel('output.xlsx', index=False)
```
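`read_csv` also takes many options for messy real-world files. A short sketch of a few commonly used parameters (the file name and column names here are placeholders, not a real dataset):

```python
# Hypothetical file and column names, for illustration only
df = pd.read_csv(
    'data.csv',
    sep=',',                  # field delimiter
    usecols=['Name', 'Age'],  # load only the columns you need
    dtype={'Age': 'int64'},   # force a dtype instead of inferring it
    na_values=['N/A', '-'],   # extra strings to treat as missing
)
```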
```python
# Inspect the first and last few rows
print(df.head())  # First 5 rows
print(df.tail())  # Last 5 rows

# Print a summary of the data (info() prints directly and returns None,
# so there is no need to wrap it in print())
df.info()

# Get statistics for the numeric columns
print(df.describe())
```
Output:

```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Name    5 non-null      object
 1   Age     5 non-null      int64
dtypes: int64(1), object(1)
memory usage: 208.0+ bytes
             Age
count   5.000000
mean   35.000000
std     7.905694
min    25.000000
25%    30.000000
50%    35.000000
75%    40.000000
max    45.000000
```
```python
# Handle missing values ('Column' is a placeholder name).
# Assignment is preferred over inplace=True; calling fillna(inplace=True)
# on a selected column is deprecated chained assignment.
df['Column'] = df['Column'].fillna(0)  # Replace missing values with 0
df = df.dropna()                       # Remove rows with missing values

# Remove duplicates
df = df.drop_duplicates()
```
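Before dropping or filling anything, it helps to see how much is actually missing. A small sketch using the standard `isna()` API:

```python
# Count missing values per column
print(df.isna().sum())

# Fraction of rows that contain at least one missing value
print(df.isna().any(axis=1).mean())
```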
```python
# Select a column
ages = df['Age']

# Select multiple columns
subset = df[['Name', 'Age']]

# Select rows using conditions
adults = df[df['Age'] > 18]  # or df.query('Age > 18')

# Select a row by index label
row = df.loc[0]

# Select a single value by row label and column name
value = df.at[0, 'Name']

# Select a row by position
row = df.iloc[0]

# Select a single value by position
value = df.iat[0, 0]

# Select rows based on multiple conditions
subset = df[(df['Age'] > 18) & (df['Name'] == 'Alice')]

# Select rows based on a list of values
subset = df[df['Name'].isin(['Alice', 'Bob'])]

# Select rows based on a string match
subset = df[df['Name'].str.contains('A')]
```
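Boolean masks and column selection can also be combined in a single `.loc` call, which avoids chained-indexing warnings when you later assign into the result. A short sketch (the `Group` column below is hypothetical, added purely for illustration):

```python
# Rows where Age > 18, keeping only the Name column
names_of_adults = df.loc[df['Age'] > 18, 'Name']

# Same condition, assigning safely back into the frame
df.loc[df['Age'] > 18, 'Group'] = 'adult'
```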
```python
# Sample data matching the output below
df = pd.DataFrame({'Category': ['A', 'A', 'B', 'B'],
                   'Value': [20, 30, 30, 30]})
print("Original Data:\n", df)

# Group by a column and calculate the mean
grouped = df.groupby('Category')['Value'].mean()

# Multiple aggregations
agg = df.groupby('Category').agg({'Value': ['mean', 'sum']})

print("Grouped Data:\n", grouped)
print("Aggregated Data:\n", agg)
```
Output (note that `mean()` on an integer column returns float64):

```
Original Data:
   Category  Value
0        A     20
1        A     30
2        B     30
3        B     30
Grouped Data:
Category
A    25.0
B    30.0
Name: Value, dtype: float64
Aggregated Data:
         Value
          mean sum
Category
A         25.0  50
B         30.0  60
```
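The dictionary style of `agg` produces a two-level column index, which can be awkward downstream. Named aggregation yields flat column names instead; a short sketch using the same `df`:

```python
# Flat output columns via named aggregation
summary = df.groupby('Category').agg(
    mean_value=('Value', 'mean'),
    total_value=('Value', 'sum'),
)
print(summary)
```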
```python
# Sample DataFrames matching the output below
df1 = pd.DataFrame({'Key': ['K1', 'K2'], 'Value1': [10, 20]})
df2 = pd.DataFrame({'Key': ['K1', 'K2', 'K3', 'K4'],
                    'Value2': [20, 30, 40, 50]})
print("Original DataFrames:")
print(df1)
print(df2)

# Merge two DataFrames (inner join on 'Key' by default)
merged = pd.merge(df1, df2, on='Key')

# Concatenate along rows; unmatched columns are filled with NaN
concatenated = pd.concat([df1, df2], axis=0)

print("Merged DataFrame:\n", merged)
print("Concatenated DataFrame:\n", concatenated)
```
Output:

```
Original DataFrames:
  Key  Value1
0  K1      10
1  K2      20
  Key  Value2
0  K1      20
1  K2      30
2  K3      40
3  K4      50
Merged DataFrame:
  Key  Value1  Value2
0  K1      10      20
1  K2      20      30
Concatenated DataFrame:
  Key  Value1  Value2
0  K1    10.0     NaN
1  K2    20.0     NaN
0  K1     NaN    20.0
1  K2     NaN    30.0
2  K3     NaN    40.0
3  K4     NaN    50.0
```
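Note that the default merge is an inner join, so K3 and K4 vanish from the merged result, while `concat` keeps every row (with NaN for the missing columns) and repeats the original index labels. A short sketch of the usual remedies, using the same `df1` and `df2`:

```python
# Keep unmatched keys with an outer (or 'left'/'right') join
outer = pd.merge(df1, df2, on='Key', how='outer')

# Renumber rows after concatenation instead of repeating index labels
stacked = pd.concat([df1, df2], axis=0, ignore_index=True)
```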
```python
import numpy as np

# Create a DataFrame with random data
data = {
    'A': np.random.randint(0, 100, size=10),
    'B': np.random.randn(10),
}
df = pd.DataFrame(data)
print("Generated DataFrame:\n", df)
```
Output (the values are random and will differ on every run):

```
Generated DataFrame:
     A         B
0   12  0.123456
1   45 -0.654321
2   78  0.987654
3   23 -0.123456
4   56  0.456789
5   89 -0.789012
6   34  0.345678
7   67 -0.567890
8   90  0.678901
9   10 -0.234567
```
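Because the data is random, every run produces different values. If you need reproducible sample data, seed a generator first; a minimal sketch using NumPy's recommended `Generator` API:

```python
import numpy as np
import pandas as pd

# A seeded generator yields the same "random" data on every run
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'A': rng.integers(0, 100, size=10),
    'B': rng.standard_normal(10),
})
print(df)
```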