Python for Data

NumPy · Pandas · Matplotlib · SciPy · Data Patterns

mitraaiprojects.com

NumPy — Arrays

import numpy as np

# Create
np.array([1,2,3])         # 1D
np.zeros((3,4))           # 3×4 zeros
np.ones((2,3))            # 2×3 ones
np.eye(4)                 # 4×4 identity
np.random.randn(3,3)      # std normal
np.arange(0,10,2)         # [0,2,4,6,8]
np.linspace(0,1,5)        # 5 evenly spaced

# Shape & Reshape
arr.shape    # (3,4)
arr.ndim     # 2
arr.dtype    # float64
arr.reshape(2,6)
arr.flatten()
arr.T        # transpose

NumPy — Operations

# Indexing & Slicing
arr[0,1]         # element
arr[0:2, 1:3]    # slice
arr[arr>0]       # boolean mask
arr[[0,2]]       # fancy indexing

# Math
arr + 10         # broadcast
arr * arr        # element-wise
np.dot(A, B)     # matrix multiply
A @ B            # same for 1-D/2-D arrays

# Statistics
arr.mean()       arr.mean(axis=0)
arr.std()        arr.var()
arr.min()        arr.max()
arr.sum()        arr.cumsum()
np.percentile(arr, 75)
np.corrcoef(x, y)   # correlation

Pandas — DataFrame

import pandas as pd

df = pd.read_csv("data.csv")
df.head(5)         df.tail(3)
df.shape           df.dtypes
df.describe()      df.info()
df.columns         df.index

# Select
df["col"]          df[["a","b"]]
df.loc[0:5, "col"] # label (end inclusive)
df.iloc[0:5, 0:3]  # position (end exclusive)
df[df["col"] > 5]  # filter

# Modify (rename/drop/drop_duplicates/reset_index return a new DataFrame — assign the result)
df["new"] = df["a"] + df["b"]
df.rename(columns={"old":"new"})
df.drop("col", axis=1)
df.drop_duplicates()
df.reset_index(drop=True)

Pandas — Data Cleaning

# Missing values
df.isnull().sum()       # count NaN
df.dropna()             # drop rows
df.dropna(how="all")    # drop rows where all values are NaN
df.fillna(0)            # fill value
df.fillna(df.mean(numeric_only=True))    # fill numeric cols with mean
df.interpolate()        # interpolate

# Type conversion
df["col"].astype(int)
pd.to_datetime(df["date"])
pd.to_numeric(df["val"], errors="coerce")

# String ops (Series.str)
df["text"].str.lower()
df["text"].str.strip()
df["text"].str.contains("pattern")
df["text"].str.extract(r"(\d+)")
df["text"].str.split(",", expand=True)

Pandas — Aggregation

# GroupBy
df.groupby("col")["val"].mean()
df.groupby("col").agg(
    mean=("val","mean"),
    count=("val","count"),
    std=("val","std")
)

# Pivot tables
df.pivot_table(
    values="sales",
    index="region",
    columns="month",
    aggfunc="sum"
)

# Apply
df["col"].apply(lambda x: x**2)
df.apply(lambda row: f(row), axis=1)

# Merge / Join
pd.merge(df1, df2, on="key")
pd.merge(df1, df2, on="k", how="left")
pd.concat([df1, df2], axis=0)  # stack rows
pd.concat([df1, df2], axis=1)  # stack cols

Matplotlib / Plotting

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1,2,figsize=(12,4))

# Line plot
axes[0].plot(x, y, color="purple", lw=2)
axes[0].set_title("Title")
axes[0].set_xlabel("X"); axes[0].set_ylabel("Y")
axes[0].legend(["label"])
axes[0].grid(alpha=0.3)

# Bar chart
axes[1].bar(categories, values, color="#6b21a8")

# Scatter plot
plt.scatter(x, y, c=labels, cmap="viridis", s=50)

# Histogram
plt.hist(data, bins=30, alpha=0.7)

# Heatmap (seaborn)
import seaborn as sns
sns.heatmap(corr_matrix, annot=True, cmap="Purples")

plt.tight_layout()
plt.savefig("plot.png", dpi=150)
plt.show()

Common Data Patterns

Task	Code
Normalise (0–1)	`(df-df.min())/(df.max()-df.min())`
Standardise	`(df-df.mean())/df.std()`
One-hot encode	`pd.get_dummies(df["col"])`
Label encode	`df["col"].map({"a":0,"b":1})`
Cut to bins	`pd.cut(df["age"], bins=[0,18,65,99])`
Rolling window	`df["col"].rolling(7).mean()`
Shift (lag)	`df["col"].shift(1)`
Rank	`df["col"].rank(ascending=False)`
Nunique	`df["col"].nunique()`
Value counts	`df["col"].value_counts(normalize=True)`

Performance Tips

# Vectorise — avoid Python loops
df["result"] = df["a"] * df["b"]   # fast
# NOT: for i in range(len(df)): ...  # slow

# Categorical reduces memory
df["col"] = df["col"].astype("category")

# Read large CSVs efficiently
chunks = pd.read_csv("big.csv",
    usecols=["col1","col2"],  # only needed cols
    dtype={"id":"int32"},      # smaller dtypes
    chunksize=10000)            # returns an iterator of DataFrames
# for chunk in chunks: process(chunk)   # each chunk is a DataFrame

# Query string (readable + fast)
df.query("age > 25 and city == 'Mumbai'")

# Parquet for fast read/write
df.to_parquet("data.parquet")
df = pd.read_parquet("data.parquet")