ETL Utilities#

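The etl module provides data wrangling utilities: smoothing, cross-correlation, JSON helpers, time conversions, and more. The last two sections of this page cover related helpers that live outside etl: DateCodec (from matviz.datetime_converter) and the circle utilities (from matviz.circle_utils).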

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matviz.etl import (
    nan_smooth, start_and_ends, xcorr, max_lag, round_time,
    drop_mostly_na, robust_floater, isdigit, flatten, chunks,
    load_json, dump_json, encode_floats, find_percentile,
    geometric_median, hex2rgb
)
from matviz.viz import nicefy, subplotter, title

nan_smooth#

Smooth a time series while gracefully handling NaN values.

# Reproducible noisy sine wave
np.random.seed(42)
N = 200
x = np.linspace(0, 4*np.pi, N)
y = np.sin(x) + np.random.randn(N) * 0.3

# Knock out two stretches of samples so the series contains NaN gaps
for gap in (slice(40, 55), slice(120, 135)):
    y[gap] = np.nan

plt.figure(figsize=(15, 4))

# One panel per value of n: raw samples in grey, smoothed curve in red
for i, n in enumerate([5, 15, 30]):
    ttl = f'n={n}'
    subplotter(1, 3, i)
    plt.plot(x, y, '.', color='0.7', alpha=0.5, label='raw')
    smoothed = nan_smooth(y, n=n)
    plt.plot(x, smoothed, 'r-', lw=2, label=f'smoothed ({ttl})')
    plt.legend()
    title(ttl)
    nicefy()

plt.show()

/Users/yoni/Documents/repos/matviz/matviz/etl.py:525: RuntimeWarning: invalid value encountered in divide
  outt = ys/enss
/Users/yoni/Documents/repos/matviz/matviz/etl.py:525: RuntimeWarning: invalid value encountered in divide
  outt = ys/enss

../_images/7fb5c13d4e7f7b0e46da0e5ea2d5625952c60a13eea4618b8ebd8ef7dcdd5ce5.png

start_and_ends#

Find contiguous True regions in a boolean array.

# Random walk, and a boolean mask of the samples where it is positive
signal = np.cumsum(np.random.randn(100))
above_zero = signal > 0

# Each entry of `regions` unpacks into a pair (s, e)
regions = start_and_ends(above_zero)

print(f'Found {len(regions)} regions where signal > 0:')
# Only the first five regions are listed
for s, e in regions[:5]:
    print(f'  [{s}, {e}]')

Found 1 regions where signal > 0:
  [0, 99]

xcorr / max_lag#

Normalized cross-correlation and lag detection.

# Sample spacing and time axis: 10 s sampled every 0.01 s
dt = 0.01
t = np.arange(0, 10, dt)
true_lag = 0.5  # seconds

# a is a noisy sine with a 1 s period; b is the same sine delayed by true_lag,
# with independent noise.
# NOTE(review): a 0.5 s delay is exactly half of the 1 s period, so the
# noise-free part of b is the negated noise-free part of a, and shifts of
# +0.5 s and -0.5 s align the two signals equally well. The sign of the
# detected lag is therefore not pinned down by this data (the recorded output
# reports -0.500 s). A lag that is not a multiple of half the period, or a
# non-periodic signal, would make this demo unambiguous.
a = np.sin(2 * np.pi * t) + np.random.randn(len(t)) * 0.2
b = np.sin(2 * np.pi * (t - true_lag)) + np.random.randn(len(t)) * 0.2

# xcorr returns the correlation values and the matching lags; passing dt
# is what lets the lag axis be labelled in seconds below.
corrs, lags = xcorr(a, b, dt)

plt.figure(figsize=(10, 4))
plt.plot(lags, corrs)
plt.xlabel('Lag (seconds)')
plt.ylabel('Correlation')
title('Cross-correlation')

# max_lag returns a (lag, correlation value) pair; only the lag is used here,
# to mark its position on the correlation curve.
detected_lag, max_corr_val = max_lag(a, b, dt)
plt.axvline(detected_lag, color='r', linestyle='--', label=f'max_lag={detected_lag:.2f}s')
plt.legend()
nicefy()
plt.show()

print(f'True lag: {true_lag}s, Detected lag: {detected_lag:.3f}s')

../_images/06dfeead365024ef5951134f51aa95170b56a4e6f891fa3c102c7336163e286f.png

True lag: 0.5s, Detected lag: -0.500s

round_time and drop_mostly_na#

import datetime

# Three timestamps on 2024-01-01, built from (hour, minute, second) triples
clock_times = [(10, 23, 45), (10, 47, 12), (11, 5, 30)]
times = [datetime.datetime(2024, 1, 1, hh, mm, ss) for hh, mm, ss in clock_times]

# Called with only the timestamps, i.e. with round_time's default settings
rounded = round_time(times)

print('Original:', times)
print('Rounded to hour:', rounded.tolist())

Original: [datetime.datetime(2024, 1, 1, 10, 23, 45), datetime.datetime(2024, 1, 1, 10, 47, 12), datetime.datetime(2024, 1, 1, 11, 5, 30)]
Rounded to hour: [Timestamp('2024-01-01 10:00:00'), Timestamp('2024-01-01 11:00:00'), Timestamp('2024-01-01 11:00:00')]

/Users/yoni/Documents/repos/matviz/matviz/etl.py:214: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead.
  return pd.Series(ts).dt.round(round_by)

# Three columns with 0, 4 and 1 missing values out of 5, respectively
columns = {
    'good_col': [1, 2, 3, 4, 5],
    'sparse_col': [np.nan] * 4 + [1],
    'ok_col': [1, np.nan, 3, 4, 5],
}
df = pd.DataFrame(columns)
print('Before:', df.columns.tolist())

# Compare the column list after filtering with threshold=0.5
trimmed = drop_mostly_na(df, threshold=0.5)
print('After drop_mostly_na:', trimmed.columns.tolist())

Before: ['good_col', 'sparse_col', 'ok_col']
After drop_mostly_na: ['good_col', 'ok_col']

robust_floater and isdigit#

# Numbers, numeric strings, a non-numeric string and missing values
test_values = [42, '3.14', 'hello', None, np.nan, '-7']
for v in test_values:
    print(f'robust_floater({v!r}) = {robust_floater(v)}')

# Blank line between the two listings
print()

# Integer-like, decimal, negative and non-numeric strings, plus a plain int
digit_candidates = ['123', '3.14', '-5', 'abc', 42]
for v in digit_candidates:
    print(f'isdigit({v!r}) = {isdigit(v)}')

robust_floater(42) = 42
robust_floater('3.14') = 3.14
robust_floater('hello') = nan
robust_floater(None) = nan
robust_floater(nan) = nan
robust_floater('-7') = -7.0

isdigit('123') = True
isdigit('3.14') = True
isdigit('-5') = True
isdigit('abc') = False
isdigit(42) = True

flatten and chunks#

# Build an array out of three short arrays and pass it to flatten
rows = ([1, 2, 3], [4, 5, 6], [7, 8, 9])
nested = np.array([np.array(row) for row in rows])
print('flatten:', flatten(nested))

# Split the integers 0..9 with a chunk size of 3
data = [*range(10)]
pieces = list(chunks(data, 3))
print('chunks(range(10), 3):', pieces)

flatten: [1 2 3 4 5 6 7 8 9]
chunks(range(10), 3): [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

load_json / dump_json with complex numbers#

import os
import tempfile

# Payload mixing plain JSON types with a complex-valued NumPy array
data = {
    'name': 'test',
    'values': np.array([1+2j, 3+4j, 5+6j]),
    'scalar': 42
}

# Only the path is needed: the handle is closed when the `with` block exits,
# before dump_json writes to that path. delete=False keeps the file on disk
# after the handle is closed.
with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f:
    tmp_path = f.name

try:
    dump_json(data, tmp_path)
    loaded = load_json(tmp_path)
finally:
    # Because of delete=False nothing else removes the file, so clean it up
    # here even when dump_json or load_json raises.
    os.unlink(tmp_path)

print('Original complex:', data['values'])
print('Loaded complex:', loaded['values'])
print('Round-trip match:', np.allclose(data['values'], loaded['values']))

Original complex: [1.+2.j 3.+4.j 5.+6.j]
Loaded complex: [1.+2.j 3.+4.j 5.+6.j]
Round-trip match: True

encode_floats and find_percentile#

# Values of very different magnitude, encoded with decimals=3
nums = np.array([1.23456789, 0.000123, 999.9999])
encoded = encode_floats(nums, decimals=3)
print('encode_floats:', encoded)

# Percentiles 0..100 of a standard-normal sample serve as the lookup table
sample = np.random.randn(10000)
percentiles = np.percentile(sample, range(101))
print(f'find_percentile(0.0) = {find_percentile(0.0, percentiles)}th percentile')

encode_floats: [Decimal('1.235'), Decimal('0.000'), Decimal('1000.000')]
find_percentile(0.0) = 51th percentile

geometric_median#

np.random.seed(42)
# Main cloud around the origin plus a smaller outlier cluster around (5, 5);
# the main cloud is drawn first so the random sequence is unchanged.
core = np.random.randn(200, 2)
outliers = np.random.randn(20, 2) + [5, 5]
points = np.vstack([core, outliers])

# Two candidate "centers" of the same point set
gm = geometric_median(points)
mean = points.mean(axis=0)

xs, ys = points[:, 0], points[:, 1]
plt.scatter(xs, ys, alpha=0.3, label='data')
plt.scatter(*mean, s=200, marker='x', c='red', lw=3, label=f'mean ({mean[0]:.2f}, {mean[1]:.2f})')
plt.scatter(*gm, s=200, marker='*', c='green', lw=3, label=f'geometric median ({gm[0]:.2f}, {gm[1]:.2f})')
plt.legend()
title('geometric_median is robust to outliers')
nicefy()
plt.show()

../_images/5a033792b8870a25ed0de0cd4abdfbab514dba52345fb00448f78ca5f30ddfac.png

DateCodec#

from matviz.datetime_converter import DateCodec
import datetime

# One codec instance is reused for all three round trips below; each round
# trip converts a value to a number with to_number and back with from_number.
codec = DateCodec()

# Python datetime
# NOTE(review): the name `ns` suggests nanoseconds; the recorded value
# 1718454600000000000 is consistent with nanoseconds since the Unix epoch —
# confirm against DateCodec.
dt_orig = datetime.datetime(2024, 6, 15, 12, 30, 0)
ns = codec.to_number(dt_orig)
dt_back = codec.from_number(ns)
print(f'datetime: {dt_orig} -> {ns} -> {dt_back}')

# Pandas Timestamp
# NOTE(review): reset() presumably clears state kept from the previous
# to_number call so the codec can accept a different input type — confirm
# against DateCodec.
codec.reset()
ts_orig = pd.Timestamp('2024-06-15 12:30:00')
ns = codec.to_number(ts_orig)
ts_back = codec.from_number(ns)
print(f'Timestamp: {ts_orig} -> {ns} -> {ts_back}')

# numpy datetime64 (same instant again, after another reset)
codec.reset()
np_orig = np.datetime64('2024-06-15T12:30:00')
ns = codec.to_number(np_orig)
np_back = codec.from_number(ns)
print(f'datetime64: {np_orig} -> {ns} -> {np_back}')

datetime: 2024-06-15 12:30:00 -> 1718454600000000000 -> 2024-06-15 12:30:00
Timestamp: 2024-06-15 12:30:00 -> 1718454600000000000 -> 2024-06-15 12:30:00
datetime64: 2024-06-15T12:30:00 -> 1718454600000000000 -> 2024-06-15T12:30:00.000000000

circle_utils: cmake_circle and cget_area#

from matviz.circle_utils import cmake_circle, cget_area
from matviz.viz import cplot, cscatter

# 50 random points in the complex plane; the real parts are drawn before the
# imaginary parts so the random sequence is unchanged.
np.random.seed(42)
re_part = np.random.randn(50)
im_part = np.random.randn(50)
z = re_part + 1j * im_part

# cmake_circle yields a center and a radius; cget_area yields the value
# shown in the figure title.
z_center, r = cmake_circle(z)
area = cget_area(z)

# Scatter the points, then trace the circle of radius r around z_center
cscatter(z, alpha=0.5, label='points')
t = np.linspace(0, 2*np.pi, 100)
circle = z_center + r * np.exp(1j * t)
cplot(circle, 'r-', lw=2, label=f'enclosing circle (r={r:.2f})')

# Mark the circle's center with a red cross drawn above the other artists
cscatter(np.array([z_center]), s=100, c='red', marker='x', zorder=5)

plt.axis('equal')
plt.legend()
title(f'cmake_circle + cget_area (area={area:.2f})')
nicefy()
plt.show()

../_images/986552d213826a29752f7748d9b08594dddf57ac88835a64c17fb2b105efc258.png