import numpy as np

# Sample data so the snippets below run as written.
# These are placeholders; substitute your own arrays.
arr = np.arange(12, dtype=float).reshape(3, 4)
A = np.eye(3)
B = np.ones((3, 2))
x = arr[0]
y = arr[1]

# Create
np.array([1, 2, 3])       # 1D
np.zeros((3, 4))          # 3×4 zeros
np.ones((2, 3))           # 2×3 ones
np.eye(4)                 # 4×4 identity
np.random.randn(3, 3)     # std normal
np.arange(0, 10, 2)       # [0,2,4,6,8]
np.linspace(0, 1, 5)      # 5 evenly spaced

# Shape & Reshape
arr.shape                 # (3,4)
arr.ndim                  # 2
arr.dtype                 # float64
arr.reshape(2, 6)
arr.flatten()
arr.T                     # transpose

# Indexing & Slicing
arr[0, 1]                 # element
arr[0:2, 1:3]             # slice
arr[arr > 0]              # boolean mask
arr[[0, 2]]               # fancy indexing

# Math
arr + 10                  # broadcast
arr * arr                 # element-wise
np.dot(A, B)              # matrix multiply
A @ B                     # same (for 2-D arrays)

# Statistics
arr.mean()
arr.mean(axis=0)
arr.std()
arr.var()
arr.min()
arr.max()
arr.sum()
arr.cumsum()
np.percentile(arr, 75)
np.corrcoef(x, y)         # correlation
import pandas as pd
df = pd.read_csv("data.csv")

# Inspect (one expression per line; two on a line is a SyntaxError)
df.head(5)              # first 5 rows
df.tail(3)              # last 3 rows
df.shape
df.dtypes
df.describe()
df.info()
df.columns
df.index

# Select
df["col"]               # single column
df[["a", "b"]]          # list of columns
df.loc[0:5, "col"]      # label
df.iloc[0:5, 0:3]       # position
df[df["col"] > 5]       # filter
# Modify
# Only the first line stores its result; the remaining calls are shown as
# bare expressions, so bind the return value if you want to keep it.
df["new"] = df["a"] + df["b"]                  # derived column
df.rename({"old": "new"}, axis="columns")      # rename a column
df.drop(columns="col")                         # remove a column
df.drop_duplicates()                           # remove repeated rows
df.reset_index(drop=True)                      # renumber rows, discard old index
# Missing values
df.isnull().sum()                       # count NaN per column
df.dropna()                             # drop rows containing any NaN
df.dropna(how="all")                    # drop rows where every value is NaN
df.fillna(0)                            # fill with a constant
# numeric_only=True restricts the means to numeric columns; without it,
# pandas >= 2.0 raises TypeError when the frame has text/object columns.
df.fillna(df.mean(numeric_only=True))   # fill numeric columns with their mean
df.interpolate()                        # interpolate
# Type conversion
df["col"].astype(dtype=int)                      # cast to integer
pd.to_datetime(arg=df["date"])                   # parse as datetimes
pd.to_numeric(arg=df["val"], errors="coerce")    # unparseable values become NaN
# String ops (via the Series.str accessor)
df["text"].str.lower()                          # lowercase
df["text"].str.strip()                          # trim surrounding whitespace
df["text"].str.contains(pat="pattern")          # match test per element
df["text"].str.extract(pat=r"(\d+)")            # first capture group
df["text"].str.split(pat=",", expand=True)      # one column per piece
# GroupBy
df.groupby("col")["val"].agg("mean")     # one statistic per group

# Several named statistics of "val" per group; keyword = output column name
df.groupby("col")["val"].agg(mean="mean", count="count", std="std")
# Pivot tables: rows = region, columns = month, cells = summed sales
pd.pivot_table(
    df,
    index="region",
    columns="month",
    values="sales",
    aggfunc="sum",
)
# Apply
df["col"].apply(lambda value: value ** 2)        # element by element
df.apply(lambda record: f(record), axis=1)       # row by row (f is your function)
# Merge / Join
df1.merge(df2, on="key")                      # join on a shared key column
df1.merge(df2, on="k", how="left")            # left join
pd.concat([df1, df2], axis="index")           # stack rows
pd.concat([df1, df2], axis="columns")         # stack cols
import matplotlib.pyplot as plt
# One row, two side-by-side axes
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Line plot
axes[0].plot(x, y, color="purple", linewidth=2)
axes[0].set(title="Title", xlabel="X", ylabel="Y")
axes[0].legend(["label"])
axes[0].grid(alpha=0.3)

# Bar chart
axes[1].bar(categories, values, color="#6b21a8")

# Scatter plot
plt.scatter(x, y, s=50, c=labels, cmap="viridis")

# Histogram
plt.hist(data, bins=30, alpha=0.7)
# Heatmap (seaborn)
import seaborn as sns

# cmap must name a colormap: "purple" is only a colour name and raises
# ValueError, whereas "Purples" is the sequential purple colormap.
sns.heatmap(corr_matrix, annot=True, cmap="Purples")

plt.tight_layout()
plt.savefig("plot.png", dpi=150)   # write the file before showing the figure
plt.show()
| Task | Code |
|---|---|
| Normalise (0–1) | (df-df.min())/(df.max()-df.min()) |
| Standardise | (df-df.mean())/df.std() |
| One-hot encode | pd.get_dummies(df["col"]) |
| Label encode | df["col"].map({"a":0,"b":1}) |
| Cut to bins | pd.cut(df["age"], bins=[0,18,65,99]) |
| Rolling window | df["col"].rolling(7).mean() |
| Shift (lag) | df["col"].shift(1) |
| Rank | df["col"].rank(ascending=False) |
| Count unique values | df["col"].nunique() |
| Value counts | df["col"].value_counts(normalize=True) |
# Vectorise — work on whole columns, avoid Python loops
df["result"] = df["a"].mul(df["b"])  # fast
# NOT: for i in range(len(df)): ...  # slow

# Categorical dtype reduces memory
df["col"] = df["col"].astype(pd.CategoricalDtype())
# Read large CSVs efficiently
# With chunksize set, read_csv returns an iterator of DataFrames rather than
# a single DataFrame, so the chunks are consumed explicitly here.
chunks = pd.read_csv(
    "big.csv",
    usecols=["col1", "col2"],   # only needed cols
    dtype={"id": "int32"},      # smaller dtypes
    chunksize=10000,            # rows per chunk
)
# Filter or aggregate each chunk before this step if the full file does not
# fit in memory; concatenating keeps `df` a real DataFrame for later use.
df = pd.concat(chunks)
# Query string (readable + fast)
df.query(expr="age > 25 and city == 'Mumbai'")

# Parquet for fast read/write: save the frame, then load it back
df.to_parquet(path="data.parquet")
df = pd.read_parquet(path="data.parquet")