Source code for fenn.tabular

from typing import Optional

import numpy as np
import pandas as pd


def summary(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a per-column overview of a DataFrame and print its shape.

    The returned frame is indexed by the columns of ``df`` and holds:

    * ``dtype``     - the column dtype,
    * ``missing``   - the number of null entries,
    * ``missing_%`` - nulls as a percentage of the row count (2 decimals),
    * ``unique``    - the distinct-value count from ``DataFrame.nunique``.

    Side effect: prints one line, ``Shape: <rows> rows x <cols> columns``.
    """
    # Count nulls once and reuse the result for both derived columns.
    null_counts = df.isnull().sum()
    null_percent = (null_counts / len(df) * 100).round(2)

    overview = pd.DataFrame(
        {
            "dtype": df.dtypes,
            "missing": null_counts,
            "missing_%": null_percent,
            "unique": df.nunique(),
        }
    )

    n_rows, n_cols = df.shape
    print(f"Shape: {n_rows} rows x {n_cols} columns")
    return overview
def missing_report(df: pd.DataFrame) -> pd.DataFrame:
    """
    Report missing values for every column that has at least one null.

    The result is indexed by column name and contains:

    * ``missing``     - the number of null entries,
    * ``missing_%``   - nulls as a percentage of the row count (2 decimals),
    * ``all_null``    - True when every row of the column is null,
    * ``almost_null`` - True when more than 90% of the rows are null.

    Columns without any null value are left out of the report.
    """
    n_rows = len(df)
    null_counts = df.isnull().sum()
    null_percent = (null_counts / n_rows * 100).round(2)

    table = pd.DataFrame(
        {
            "missing": null_counts,
            "missing_%": null_percent,
            "all_null": null_counts == n_rows,
            "almost_null": null_percent > 90,
        }
    )

    # Keep only the columns that actually contain missing data.
    has_nulls = table["missing"] > 0
    return table[has_nulls]
def numeric_profile(
    df: pd.DataFrame, clip_quantile: Optional[float] = None
) -> pd.DataFrame:
    """
    Describe the numeric columns of ``df``, optionally clipping the tails.

    Args:
        df: Input frame; only columns selected by
            ``select_dtypes(include=[np.number])`` are profiled.
        clip_quantile: When given, each numeric column is clipped to the
            range between its ``q`` and ``1 - q`` quantiles before the
            statistics are computed. The value is treated symmetrically,
            so ``0.05`` and ``0.95`` both clip to the 5%-95% range.

    Returns:
        The transposed output of ``DataFrame.describe`` - one row per
        numeric column.

    Raises:
        ValueError: If ``clip_quantile`` is not within ``[0, 1]``.
    """
    numeric_df = df.select_dtypes(include=[np.number])

    if clip_quantile is not None:
        if not 0 <= clip_quantile <= 1:
            raise ValueError(
                f"clip_quantile must be between 0 and 1, got {clip_quantile!r}"
            )
        # Order the two quantiles so the lower bound never exceeds the upper
        # one. With per-column (Series) bounds, clip applies the lower bound
        # and then the upper bound, so inverted bounds would collapse every
        # value onto a single number.
        lower_q = min(clip_quantile, 1 - clip_quantile)
        upper_q = max(clip_quantile, 1 - clip_quantile)
        lower = numeric_df.quantile(lower_q)
        upper = numeric_df.quantile(upper_q)
        # axis=1 aligns the per-column bounds with the frame's columns.
        numeric_df = numeric_df.clip(lower=lower, upper=upper, axis=1)

    return numeric_df.describe().T
def quick_sample(
    df: pd.DataFrame,
    n: int = 5,
    columns: Optional[list] = None,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Draw a random sample of rows, optionally restricted to some columns.

    Args:
        df: Frame to sample from.
        n: Requested number of rows; capped at the number of rows available.
        columns: Columns to keep. ``None`` or an empty list keeps all columns.
        seed: Passed to ``DataFrame.sample`` as ``random_state``.

    Returns:
        The sampled rows as returned by ``DataFrame.sample``.
    """
    if columns:
        subset = df[columns]
    else:
        subset = df

    # Never ask for more rows than exist; sample() would raise otherwise.
    sample_size = min(n, len(subset))
    return subset.sample(n=sample_size, random_state=seed)
def unique_report(df: pd.DataFrame) -> pd.DataFrame:
    """
    Report the number of distinct values in each column.

    The result is indexed by column name and contains:

    * ``unique_count`` - the distinct-value count from ``DataFrame.nunique``,
    * ``unique_%``     - that count as a percentage of the row count
      (2 decimals).
    """
    # Compute the cardinalities once and derive the percentage from them.
    distinct_counts = df.nunique()
    distinct_percent = (distinct_counts / len(df) * 100).round(2)

    return pd.DataFrame(
        {
            "unique_count": distinct_counts,
            "unique_%": distinct_percent,
        }
    )
def corr_overview(df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
    """
    List the most strongly correlated pairs of numeric columns.

    Args:
        df: Input frame; only columns selected by
            ``select_dtypes(include=[np.number])`` take part.
        top_n: Number of rows passed to ``head`` after sorting.

    Returns:
        A frame with columns ``feature_1``, ``feature_2``, ``correlation``
        and ``abs_correlation``, sorted by ``abs_correlation`` in descending
        order. Each pair of columns appears once and self-correlations are
        excluded; pairs whose correlation is NaN are dropped.
    """
    corr_matrix = df.select_dtypes(include=[np.number]).corr()

    # Strict upper triangle (k=1): keeps one entry per pair and skips the
    # diagonal, so (a, b) is reported but (b, a) and (a, a) are not.
    upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper_only = corr_matrix.where(upper_mask)

    # Reshape the matrix into one row per (row label, column label, value).
    pairs = upper_only.stack().reset_index()
    pairs.columns = pd.Index(["feature_1", "feature_2", "correlation"])
    pairs["abs_correlation"] = pairs["correlation"].abs()

    # Masked-out cells and undefined correlations show up as NaN; remove them.
    pairs = pairs.dropna(subset=["correlation"])

    ranked = pairs.sort_values("abs_correlation", ascending=False)
    return ranked.head(top_n)
def array_summary(arr: np.ndarray) -> pd.DataFrame:
    """
    Summarise an ndarray in a single-row DataFrame.

    Args:
        arr: Array to summarise. Its dtype must be accepted by ``np.isnan``
            (numeric dtypes are; object/string dtypes are expected to fail).

    Returns:
        A one-row frame with the columns ``shape``, ``dtype``, ``size``,
        ``mean``, ``std``, ``min``, ``max``, ``nan_count`` and ``nan_%``.
        The four statistics ignore NaN entries. When the array has no
        non-NaN values (it is empty or entirely NaN) they are reported as
        NaN. ``nan_%`` is rounded to two decimals and is ``0.0`` for an
        empty array.
    """
    flat = arr.flatten()

    # Build the NaN mask once; it feeds both the count and the percentage.
    nan_mask = np.isnan(flat)
    nan_count = int(nan_mask.sum())

    if nan_count == flat.size:
        # Nothing to aggregate: np.nanmin/np.nanmax raise ValueError on an
        # empty array and the nan* reductions warn on all-NaN input, so the
        # statistics are reported as NaN directly.
        mean = std = minimum = maximum = float("nan")
    else:
        mean = float(np.nanmean(flat))
        std = float(np.nanstd(flat))
        minimum = float(np.nanmin(flat))
        maximum = float(np.nanmax(flat))

    # Guard the empty case, where the mean of the mask would be 0/0.
    nan_percent = round(float(nan_mask.mean() * 100), 2) if flat.size else 0.0

    return pd.DataFrame(
        [
            {
                "shape": arr.shape,
                "dtype": arr.dtype,
                "size": arr.size,
                "mean": mean,
                "std": std,
                "min": minimum,
                "max": maximum,
                "nan_count": nan_count,
                "nan_%": nan_percent,
            }
        ]
    )