Pandas is a powerful Python library for data manipulation and analysis. Its core data structures, Series (one-dimensional, labelled) and DataFrame (two-dimensional, tabular), make it straightforward to work with structured data from CSV files, Excel workbooks, databases, and more.
import pandas as pd
# Create a Series
series = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
# Create a DataFrame
data = {'Name': ['Alice', 'Bob'], 'Age': [25, 30]}
df = pd.DataFrame(data)
print("Series:\n", series)
print("DataFrame:\n", df)
Series:
a    10
b    20
c    30
dtype: int64
DataFrame:
    Name  Age
0  Alice   25
1    Bob   30
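A Series behaves like a labelled one-dimensional array, so values can be retrieved either by index label or by integer position; a minimal sketch using the series defined above:
# Look up a value by label vs. by integer position
print(series['a'])     # 10, access by index label
print(series.iloc[0])  # 10, access by integer position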
# Read from a CSV file
df = pd.read_csv('data.csv')
# Export to a CSV file
df.to_csv('output.csv', index=False)
# Read from and write to Excel (requires an engine such as openpyxl)
df = pd.read_excel('data.xlsx')
df.to_excel('output.xlsx', index=False)
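read_csv also accepts options for messier files; a short sketch of a few common parameters (the file and column names here are illustrative):
# Common read_csv options (file and column names are illustrative)
df = pd.read_csv(
    'data.csv',
    sep=',',                  # field delimiter
    usecols=['Name', 'Age'],  # load only these columns
    dtype={'Age': 'int64'},   # force a column's dtype
    na_values=['N/A', ''],    # extra strings to treat as missing
)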
# Inspect the first and last few rows
print(df.head()) # First 5 rows
print(df.tail()) # Last 5 rows
# Get a summary of the data (info() prints directly and returns None)
df.info()
# Get statistics
print(df.describe())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Name    5 non-null      object
 1   Age     5 non-null      int64
dtypes: int64(1), object(1)
memory usage: 208.0+ bytes
             Age
count   5.000000
mean   35.000000
std     8.366600
min    25.000000
25%    30.000000
50%    35.000000
75%    40.000000
max    45.000000
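Beyond head() and describe(), a few attributes give quick structural checks; a minimal sketch:
# Quick structural checks
print(df.shape)    # (rows, columns) tuple
print(df.dtypes)   # dtype of each column
print(df.columns)  # column labels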
# Handle missing values ('Column' is a placeholder name)
df['Column'] = df['Column'].fillna(0)  # Replace missing values with 0
df = df.dropna()                       # Remove rows with missing values
# Remove duplicate rows
df = df.drop_duplicates()
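A constant is not the only possible fill value; a column statistic is often more sensible. A sketch assuming a numeric 'Age' column:
# Fill missing ages with the column mean (assumes a numeric 'Age' column)
df['Age'] = df['Age'].fillna(df['Age'].mean())
# Forward-fill: propagate the last valid value down each column
df = df.ffill()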
# Select a column
ages = df['Age']
# Select multiple columns
subset = df[['Name', 'Age']]
# Select rows using conditions
adults = df[df['Age'] > 18] # or df.query('Age > 18')
# Select a row by index label
row = df.loc[0]
# Select a single value by row label and column name
value = df.at[0, 'Name']
# Select a row by integer position
row = df.iloc[0]
# Select a single value by row and column position
value = df.iat[0, 0]
# Select rows based on multiple conditions
subset = df[(df['Age'] > 18) & (df['Name'] == 'Alice')]
# Select rows based on a list of values
subset = df[df['Name'].isin(['Alice', 'Bob'])]
# Select rows based on a string match
subset = df[df['Name'].str.contains('A')]
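loc can also slice rows and columns together; note that label-based slices in loc include both endpoints, unlike Python's usual slicing. A minimal sketch:
# Rows with labels 0 through 2 (inclusive) and two columns
subset = df.loc[0:2, ['Name', 'Age']]
# All rows, a single column
names = df.loc[:, 'Name']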
# Sample data for grouping
df = pd.DataFrame({'Category': ['A', 'A', 'B', 'B'], 'Value': [20, 30, 30, 30]})
print("Original Data:\n", df)
# Group by a column and calculate the mean per group
grouped = df.groupby('Category')['Value'].mean()
# Multiple aggregations
agg = df.groupby('Category').agg({'Value': ['mean', 'sum']})
print("Grouped Data:\n", grouped)
print("Aggregated Data:\n", agg)
Original Data:
  Category  Value
0        A     20
1        A     30
2        B     30
3        B     30
Grouped Data:
Category
A    25.0
B    30.0
Name: Value, dtype: float64
Aggregated Data:
         Value
          mean sum
Category
A         25.0  50
B         30.0  60
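The dictionary form of agg produces a MultiIndex on the columns, as the output above shows; named aggregation yields flat, readable column names instead. A minimal sketch:
# Named aggregation: flat output columns instead of a MultiIndex
agg = df.groupby('Category').agg(
    avg_value=('Value', 'mean'),
    total_value=('Value', 'sum'),
)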
# Sample DataFrames to combine
df1 = pd.DataFrame({'Key': ['K1', 'K2'], 'Value1': [10, 20]})
df2 = pd.DataFrame({'Key': ['K1', 'K2', 'K3', 'K4'], 'Value2': [20, 30, 40, 50]})
print("Original DataFrames:")
print(df1)
print(df2)
# Merge two DataFrames on a shared key (inner join by default)
merged = pd.merge(df1, df2, on='Key')
# Concatenate along rows or columns
concatenated = pd.concat([df1, df2], axis=0)
print("Merged DataFrame:\n", merged)
print("Concatenated DataFrame:\n", concatenated)
Original DataFrames:
  Key  Value1
0  K1      10
1  K2      20
  Key  Value2
0  K1      20
1  K2      30
2  K3      40
3  K4      50
Merged DataFrame:
  Key  Value1  Value2
0  K1      10      20
1  K2      20      30
Concatenated DataFrame:
  Key  Value1  Value2
0  K1    10.0     NaN
1  K2    20.0     NaN
0  K1     NaN    20.0
1  K2     NaN    30.0
2  K3     NaN    40.0
3  K4     NaN    50.0
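pd.merge performs an inner join by default, which is why K3 and K4 drop out of the merged result above; other join types are selected with the how parameter. A minimal sketch:
# Keep every key from df2, filling missing Value1 entries with NaN
right_joined = pd.merge(df1, df2, on='Key', how='right')
# Keep all keys from both frames
outer_joined = pd.merge(df1, df2, on='Key', how='outer')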
# Create a DataFrame with random data
import numpy as np
data = {
    'A': np.random.randint(0, 100, size=10),  # 10 random integers in [0, 100)
    'B': np.random.randn(10),                 # 10 draws from a standard normal
}
df = pd.DataFrame(data)
print("Generated DataFrame:\n", df)
Generated DataFrame:
    A         B
0  12  0.123456
1  45 -0.654321
2  78  0.987654
3  23 -0.123456
4  56  0.456789
5  89 -0.789012
6  34  0.345678
7  67 -0.567890
8  90  0.678901
9  10 -0.234567
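Because no seed is set, the values above will differ on every run; NumPy's Generator API makes the results reproducible. A minimal sketch:
# Seed a generator so the random data is reproducible across runs
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'A': rng.integers(0, 100, size=10),  # reproducible integers in [0, 100)
    'B': rng.standard_normal(10),        # reproducible normal draws
})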