datasus_db.pl_utils
Module with helper functions to work with polars dataframes.
"""
Module with helper functions to work with polars dataframes.
"""

from dataclasses import dataclass

import polars as pl


@dataclass
class Column:
    """Schema entry describing a column that is cast to a given dtype."""

    # Name of the column in the dataframe.
    name: str
    # Polars dtype the column is cast to (or created with when missing).
    dtype: pl.PolarsDataType
    # Forwarded to ``Expr.cast``.
    strict: bool = True

    def upsert(self, df: pl.DataFrame):
        """Return the expression that casts or creates this column for ``df``."""
        return upsert_column(df, self.name, self.dtype, strict=self.strict)


@dataclass
class DateColumn:
    """Schema entry describing a string column that is parsed into dates."""

    # Name of the column in the dataframe.
    name: str
    # Date format string handed to ``Expr.str.to_date``.
    format: str
    # Forwarded to ``Expr.str.to_date``.
    strict: bool = True

    def upsert(self, df: pl.DataFrame):
        """Return the expression that parses or creates this column for ``df``."""
        return upsert_date_column(df, self.name, self.format, strict=self.strict)


def upsert_column(df: pl.DataFrame, name: str, dtype: pl.PolarsDataType, strict=True):
    """Build an expression that guarantees column ``name`` has type ``dtype``.

    Args:
        df: Dataframe whose column names decide which branch is taken.
        name: Column to cast or create.
        dtype: Target polars dtype.
        strict: Forwarded to ``Expr.cast`` when the column exists.

    Returns:
        A cast of the existing column when ``name`` is in ``df.columns``;
        otherwise a null literal of type ``dtype`` aliased to ``name``.
    """
    if name in df.columns:
        return pl.col(name).cast(dtype, strict=strict)
    return pl.lit(None, dtype).alias(name)


def upsert_date_column(df: pl.DataFrame, name: str, format: str, strict=True):
    """Build an expression that guarantees column ``name`` is a ``pl.Date``.

    Args:
        df: Dataframe whose column names decide which branch is taken.
        name: Column to parse or create.
        format: Date format string handed to ``Expr.str.to_date``.
        strict: Forwarded to ``Expr.str.to_date`` when the column exists.

    Returns:
        The existing column parsed with ``str.to_date`` when ``name`` is in
        ``df.columns``; otherwise a null ``pl.Date`` literal aliased to
        ``name``.
    """
    if name in df.columns:
        return pl.col(name).str.to_date(format, strict=strict)
    return pl.lit(None, pl.Date).alias(name)


def to_schema(df: pl.DataFrame, schema: list[Column]):
    """Reshape ``df`` so that it only holds the columns listed in ``schema``.

    Columns of ``df`` that are not named by any schema entry are dropped.
    Every schema entry then contributes its ``upsert`` expression, which
    converts an existing column or adds a null column when it is missing.

    Args:
        df: Dataframe to conform.
        schema: Schema entries; each one must expose ``name`` and
            ``upsert(df)``.

    Returns:
        The dataframe produced by ``with_columns`` over all upsert expressions.
    """
    schema_cols = {col.name for col in schema}
    cols_to_remove = [col for col in df.columns if col not in schema_cols]
    df = df.drop(cols_to_remove)

    # The upsert expressions are built against the already-trimmed dataframe.
    return df.with_columns([col.upsert(df) for col in schema])


def rename_columns(df: pl.DataFrame, mapping: dict[str, str]):
    """Rename the columns of ``df`` that appear as keys in ``mapping``.

    Mapping keys that are not columns of ``df`` are ignored.

    Args:
        df: Dataframe to rename.
        mapping: Old column name -> new column name.

    Returns:
        ``df`` itself when no mapping key matches a column, otherwise the
        result of ``df.rename`` restricted to the matching keys.
    """
    cur_cols = set(df.columns)
    mapping_possible = {
        old: new for old, new in mapping.items() if old in cur_cols
    }

    if not mapping_possible:
        return df
    return df.rename(mapping_possible)


def fill_empty(fill: object, col=pl.col(pl.Utf8)):
    """Build an expression replacing zero-length strings with ``fill``.

    Args:
        fill: Replacement passed to ``then``.
            NOTE(review): polars documents that a plain ``str`` given to
            ``then`` is parsed as a column name; wrap literal strings in
            ``pl.lit`` -- confirm against the polars version in use.
        col: Expression to inspect; defaults to every ``pl.Utf8`` column.

    Returns:
        A ``when/then/otherwise`` expression followed by ``.name.keep()``.
    """
    return pl.when(col.str.len_chars() == 0).then(fill).otherwise(col).name.keep()


def fill_text(match: str, fill: object, col=pl.col(pl.Utf8)):
    """Build an expression replacing values equal to ``match`` with ``fill``.

    Args:
        match: Value compared against ``col`` with ``==``.
        fill: Replacement passed to ``then``.
            NOTE(review): polars documents that a plain ``str`` given to
            ``then`` is parsed as a column name; wrap literal strings in
            ``pl.lit`` -- confirm against the polars version in use.
        col: Expression to inspect; defaults to every ``pl.Utf8`` column.

    Returns:
        A ``when/then/otherwise`` expression followed by ``.name.keep()``.
    """
    return pl.when(col == match).then(fill).otherwise(col).name.keep()


def fill_non_numeric(fill: object, col: pl.Expr):
    """Build an expression replacing values that contain a non-digit with ``fill``.

    A value is replaced when the regex ``\\D`` matches anywhere in it.

    Args:
        fill: Replacement passed to ``then``.
            NOTE(review): polars documents that a plain ``str`` given to
            ``then`` is parsed as a column name; wrap literal strings in
            ``pl.lit`` -- confirm against the polars version in use.
        col: String expression to inspect.

    Returns:
        A ``when/then/otherwise`` expression followed by ``.name.keep()``.
    """
    return pl.when(col.str.contains(r"\D")).then(fill).otherwise(col).name.keep()
@dataclass
class
Column:
@dataclass
class Column:
    """Schema entry describing a column that is cast to a given dtype."""

    # Name of the column in the dataframe.
    name: str
    # Polars dtype handed to ``upsert_column``.
    dtype: pl.PolarsDataType
    # Strictness flag handed to ``upsert_column``.
    strict: bool = True

    def upsert(self, df: pl.DataFrame):
        """Delegate to ``upsert_column`` using this entry's name, dtype and strict flag."""
        return upsert_column(
            df,
            name=self.name,
            dtype=self.dtype,
            strict=self.strict,
        )
@dataclass
class
DateColumn:
def
upsert_column( df: polars.dataframe.frame.DataFrame, name: str, dtype: Union[ForwardRef('DataTypeClass'), ForwardRef('DataType')], strict=True):
def
upsert_date_column( df: polars.dataframe.frame.DataFrame, name: str, format: str, strict=True):
def
rename_columns(df: polars.dataframe.frame.DataFrame, mapping: dict[str, str]):
def
fill_empty(fill: object, col=<polars.expr.expr.Expr object>):
def
fill_text(match: str, fill: object, col=<polars.expr.expr.Expr object>):
def
fill_non_numeric(fill: object, col: polars.expr.expr.Expr):