datasus_db.pl_utils

Module with helper functions to work with polars dataframes.

 1"""
 2Module with helper functions to work with polars dataframes.
 3"""
 4
 5import polars as pl
 6from dataclasses import dataclass
 7
 8
 9@dataclass
10class Column:
11    name: str
12    dtype: pl.PolarsDataType
13    strict: bool = True
14
15    def upsert(self, df: pl.DataFrame):
16        return upsert_column(df, self.name, self.dtype, strict=self.strict)
17
18
@dataclass
class DateColumn:
    """Schema entry for a column that is parsed into dates.

    The expression itself is built by ``upsert_date_column``; ``upsert``
    forwards this entry's ``name``, ``format`` and ``strict`` to it unchanged.
    """

    # Name of the column in the dataframe.
    name: str
    # Date format string handed to ``upsert_date_column``.
    format: str
    # Forwarded as the ``strict`` keyword of ``upsert_date_column``.
    strict: bool = True

    def upsert(self, df: pl.DataFrame):
        """Return what ``upsert_date_column`` builds for this entry and ``df``."""
        return upsert_date_column(
            df,
            self.name,
            self.format,
            strict=self.strict,
        )
27
28
def upsert_column(df: pl.DataFrame, name: str, dtype: pl.PolarsDataType, strict=True):
    """Build the expression that puts column ``name`` into ``dtype``.

    Args:
        df: Dataframe whose ``columns`` decide which branch is taken.
        name: Column to cast or create.
        dtype: Target polars data type.
        strict: Passed to ``Expr.cast`` when the column exists.

    Returns:
        A cast of the existing column, or, when ``df`` has no column called
        ``name``, a null literal of type ``dtype`` aliased to ``name``.
    """
    if name not in df.columns:
        # Missing column: create it as nulls of the requested type.
        return pl.lit(None, dtype).alias(name)

    return pl.col(name).cast(dtype, strict=strict)
34
35
def upsert_date_column(df: pl.DataFrame, name: str, format: str, strict=True):
    """Build the expression that makes column ``name`` a ``pl.Date`` column.

    Args:
        df: Dataframe inspected for the presence and current type of the column.
        name: Column to parse or create.
        format: Date format string passed to ``Expr.str.to_date``.
        strict: Passed to ``Expr.str.to_date`` when the column is parsed.

    Returns:
        A null ``pl.Date`` literal aliased to ``name`` when the column is
        missing, the column unchanged when it is already ``pl.Date``, and
        otherwise the column parsed with ``format``.
    """
    if name not in df.columns:
        # Missing column: create it as null dates.
        return pl.lit(None, pl.Date).alias(name)

    if df.schema[name] == pl.Date:
        # The ``.str`` namespace only works on string columns. A column that
        # is already a date needs no parsing, so applying the same schema a
        # second time does not fail.
        return pl.col(name)

    return pl.col(name).str.to_date(format, strict=strict)
42
43
def to_schema(df: pl.DataFrame, schema: list[Column]) -> pl.DataFrame:
    """Drop the columns of ``df`` not named in ``schema`` and upsert the rest.

    Only ``.name`` and ``.upsert(df)`` are used on each schema entry, so
    ``DateColumn`` entries can be mixed with ``Column`` entries.

    Args:
        df: Input dataframe.
        schema: Entries describing the columns to keep, cast, or create.

    Returns:
        The dataframe without the columns absent from ``schema`` and with every
        entry's ``upsert`` expression applied through ``with_columns``.
    """
    wanted = {entry.name for entry in schema}
    extra_columns = [column for column in df.columns if column not in wanted]
    trimmed = df.drop(extra_columns)

    return trimmed.with_columns([entry.upsert(trimmed) for entry in schema])
50
51
def rename_columns(df: pl.DataFrame, mapping: dict[str, str]) -> pl.DataFrame:
    """Rename the columns of ``df`` that appear as keys in ``mapping``.

    Keys of ``mapping`` that are not columns of ``df`` are filtered out before
    ``df.rename`` is called.

    Args:
        df: Input dataframe.
        mapping: Current column name -> new column name.

    Returns:
        ``df`` itself when no key of ``mapping`` is a column of ``df``,
        otherwise the result of ``df.rename`` with the filtered mapping.
    """
    present = set(df.columns)
    applicable = {old: new for old, new in mapping.items() if old in present}

    if not applicable:
        return df
    return df.rename(applicable)
62
63
def fill_empty(fill: object, col=pl.col(pl.Utf8)):
    """Build an expression that replaces zero-length strings with ``fill``.

    Args:
        fill: Value handed to ``then`` for rows whose string has no characters.
        col: Expression to examine; defaults to ``pl.col(pl.Utf8)``.

    Returns:
        A ``when/then/otherwise`` expression ending in ``.name.keep()``; rows
        that do not match keep the value of ``col``.
    """
    is_blank = col.str.len_chars() == 0
    replaced = pl.when(is_blank).then(fill).otherwise(col)
    return replaced.name.keep()
66
67
def fill_text(match: str, fill: object, col=pl.col(pl.Utf8)):
    """Build an expression that replaces values equal to ``match`` with ``fill``.

    Args:
        match: Text compared against ``col`` with ``==``.
        fill: Value handed to ``then`` for the rows that compare equal.
        col: Expression to examine; defaults to ``pl.col(pl.Utf8)``.

    Returns:
        A ``when/then/otherwise`` expression ending in ``.name.keep()``; rows
        that do not match keep the value of ``col``.
    """
    is_match = col == match
    replaced = pl.when(is_match).then(fill).otherwise(col)
    return replaced.name.keep()
70
71
def fill_non_numeric(fill: object, col: pl.Expr):
    """Build an expression that replaces values containing a non-digit with ``fill``.

    Args:
        fill: Value handed to ``then`` for rows matching the regex ``\\D``.
        col: Expression to examine (its ``.str`` namespace is used).

    Returns:
        A ``when/then/otherwise`` expression ending in ``.name.keep()``; rows
        that do not match keep the value of ``col``.
    """
    has_non_digit = col.str.contains(r"\D")
    replaced = pl.when(has_non_digit).then(fill).otherwise(col)
    return replaced.name.keep()
@dataclass
class Column:
@dataclass
class Column:
    """Schema entry for a column that is cast to a target polars dtype.

    The expression itself is built by ``upsert_column``; ``upsert`` forwards
    this entry's ``name``, ``dtype`` and ``strict`` to it unchanged.
    """

    # Name of the column in the dataframe.
    name: str
    # Data type handed to ``upsert_column`` as the cast target.
    dtype: pl.PolarsDataType
    # Forwarded as the ``strict`` keyword of ``upsert_column``.
    strict: bool = True

    def upsert(self, df: pl.DataFrame):
        """Return what ``upsert_column`` builds for this entry and ``df``."""
        return upsert_column(
            df,
            self.name,
            self.dtype,
            strict=self.strict,
        )
Column( name: str, dtype: Union[ForwardRef('DataTypeClass'), ForwardRef('DataType')], strict: bool = True)
name: str
dtype: Union[ForwardRef('DataTypeClass'), ForwardRef('DataType')]
strict: bool = True
def upsert(self, df: polars.dataframe.frame.DataFrame):
16    def upsert(self, df: pl.DataFrame):
17        return upsert_column(df, self.name, self.dtype, strict=self.strict)
@dataclass
class DateColumn:
@dataclass
class DateColumn:
    """Schema entry for a column that is parsed into dates.

    The expression itself is built by ``upsert_date_column``; ``upsert``
    forwards this entry's ``name``, ``format`` and ``strict`` to it unchanged.
    """

    # Name of the column in the dataframe.
    name: str
    # Date format string handed to ``upsert_date_column``.
    format: str
    # Forwarded as the ``strict`` keyword of ``upsert_date_column``.
    strict: bool = True

    def upsert(self, df: pl.DataFrame):
        """Return what ``upsert_date_column`` builds for this entry and ``df``."""
        return upsert_date_column(
            df,
            self.name,
            self.format,
            strict=self.strict,
        )
DateColumn(name: str, format: str, strict: bool = True)
name: str
format: str
strict: bool = True
def upsert(self, df: polars.dataframe.frame.DataFrame):
26    def upsert(self, df: pl.DataFrame):
27        return upsert_date_column(df, self.name, self.format, strict=self.strict)
def upsert_column( df: polars.dataframe.frame.DataFrame, name: str, dtype: Union[ForwardRef('DataTypeClass'), ForwardRef('DataType')], strict=True):
def upsert_column(df: pl.DataFrame, name: str, dtype: pl.PolarsDataType, strict=True):
    """Build the expression that puts column ``name`` into ``dtype``.

    Args:
        df: Dataframe whose ``columns`` decide which branch is taken.
        name: Column to cast or create.
        dtype: Target polars data type.
        strict: Passed to ``Expr.cast`` when the column exists.

    Returns:
        A cast of the existing column, or, when ``df`` has no column called
        ``name``, a null literal of type ``dtype`` aliased to ``name``.
    """
    if name not in df.columns:
        # Missing column: create it as nulls of the requested type.
        return pl.lit(None, dtype).alias(name)

    return pl.col(name).cast(dtype, strict=strict)
def upsert_date_column( df: polars.dataframe.frame.DataFrame, name: str, format: str, strict=True):
def upsert_date_column(df: pl.DataFrame, name: str, format: str, strict=True):
    """Build the expression that makes column ``name`` a ``pl.Date`` column.

    Args:
        df: Dataframe inspected for the presence and current type of the column.
        name: Column to parse or create.
        format: Date format string passed to ``Expr.str.to_date``.
        strict: Passed to ``Expr.str.to_date`` when the column is parsed.

    Returns:
        A null ``pl.Date`` literal aliased to ``name`` when the column is
        missing, the column unchanged when it is already ``pl.Date``, and
        otherwise the column parsed with ``format``.
    """
    if name not in df.columns:
        # Missing column: create it as null dates.
        return pl.lit(None, pl.Date).alias(name)

    if df.schema[name] == pl.Date:
        # The ``.str`` namespace only works on string columns. A column that
        # is already a date needs no parsing, so applying the same schema a
        # second time does not fail.
        return pl.col(name)

    return pl.col(name).str.to_date(format, strict=strict)
def to_schema( df: polars.dataframe.frame.DataFrame, schema: list[Column]):
def to_schema(df: pl.DataFrame, schema: list[Column]) -> pl.DataFrame:
    """Drop the columns of ``df`` not named in ``schema`` and upsert the rest.

    Only ``.name`` and ``.upsert(df)`` are used on each schema entry, so
    ``DateColumn`` entries can be mixed with ``Column`` entries.

    Args:
        df: Input dataframe.
        schema: Entries describing the columns to keep, cast, or create.

    Returns:
        The dataframe without the columns absent from ``schema`` and with every
        entry's ``upsert`` expression applied through ``with_columns``.
    """
    wanted = {entry.name for entry in schema}
    extra_columns = [column for column in df.columns if column not in wanted]
    trimmed = df.drop(extra_columns)

    return trimmed.with_columns([entry.upsert(trimmed) for entry in schema])
def rename_columns(df: polars.dataframe.frame.DataFrame, mapping: dict[str, str]):
def rename_columns(df: pl.DataFrame, mapping: dict[str, str]) -> pl.DataFrame:
    """Rename the columns of ``df`` that appear as keys in ``mapping``.

    Keys of ``mapping`` that are not columns of ``df`` are filtered out before
    ``df.rename`` is called.

    Args:
        df: Input dataframe.
        mapping: Current column name -> new column name.

    Returns:
        ``df`` itself when no key of ``mapping`` is a column of ``df``,
        otherwise the result of ``df.rename`` with the filtered mapping.
    """
    present = set(df.columns)
    applicable = {old: new for old, new in mapping.items() if old in present}

    if not applicable:
        return df
    return df.rename(applicable)
def fill_empty(fill: object, col=<polars.expr.expr.Expr object>):
def fill_empty(fill: object, col=pl.col(pl.Utf8)):
    """Build an expression that replaces zero-length strings with ``fill``.

    Args:
        fill: Value handed to ``then`` for rows whose string has no characters.
        col: Expression to examine; defaults to ``pl.col(pl.Utf8)``.

    Returns:
        A ``when/then/otherwise`` expression ending in ``.name.keep()``; rows
        that do not match keep the value of ``col``.
    """
    is_blank = col.str.len_chars() == 0
    replaced = pl.when(is_blank).then(fill).otherwise(col)
    return replaced.name.keep()
def fill_text(match: str, fill: object, col=<polars.expr.expr.Expr object>):
def fill_text(match: str, fill: object, col=pl.col(pl.Utf8)):
    """Build an expression that replaces values equal to ``match`` with ``fill``.

    Args:
        match: Text compared against ``col`` with ``==``.
        fill: Value handed to ``then`` for the rows that compare equal.
        col: Expression to examine; defaults to ``pl.col(pl.Utf8)``.

    Returns:
        A ``when/then/otherwise`` expression ending in ``.name.keep()``; rows
        that do not match keep the value of ``col``.
    """
    is_match = col == match
    replaced = pl.when(is_match).then(fill).otherwise(col)
    return replaced.name.keep()
def fill_non_numeric(fill: object, col: polars.expr.expr.Expr):
def fill_non_numeric(fill: object, col: pl.Expr):
    """Build an expression that replaces values containing a non-digit with ``fill``.

    Args:
        fill: Value handed to ``then`` for rows matching the regex ``\\D``.
        col: Expression to examine (its ``.str`` namespace is used).

    Returns:
        A ``when/then/otherwise`` expression ending in ``.name.keep()``; rows
        that do not match keep the value of ``col``.
    """
    has_non_digit = col.str.contains(r"\D")
    replaced = pl.when(has_non_digit).then(fill).otherwise(col)
    return replaced.name.keep()