datasus_db.cnv
Module with functions to deal with DATASUS convention files (*.cnv), which are usually files that map ids to readable names.
"""
Module with functions to deal with DATASUS convention files (*.cnv), which are
usually files that map ids to readable names.
"""

import io
import re

import polars as pl

from .pl_utils import Column, to_schema


def to_dataframe(cnv_bytes: bytes, id_dtype=pl.UInt32):
    """Build a two-column (`ID`, `NOME`) DataFrame from the bytes of a .cnv file.

    The bytes are decoded as latin-1 and parsed with the default settings of
    `parse_from_bytes`.

    Args:
        cnv_bytes: Raw content of the .cnv file.
        id_dtype: Polars dtype requested for the `ID` column.

    Returns:
        The result of `to_schema` applied to the parsed rows, with `ID`
        declared as `id_dtype` and `NOME` as `pl.Utf8`.
    """
    df = pl.DataFrame(
        parse_from_bytes(cnv_bytes, encoding="latin-1"),
    ).rename({"column_0": "ID", "column_1": "NOME"})

    # NOTE(review): `to_schema` is defined in `.pl_utils`; presumably it casts
    # and selects the listed columns -- confirm against that module.
    return to_schema(df, [Column("ID", id_dtype), Column("NOME", pl.Utf8)])


def parse_from_bytes(
    bytes: bytes,
    encoding="utf-8",
    skip_rows=1,
    delimiter=r"\s{2,}",
    id_idx=3,
    label_idx=2,
):
    """Decode `bytes` as text and parse it with `parse`.

    Args:
        bytes: Raw content of the .cnv file.
        encoding: Text encoding used to decode `bytes`.
        skip_rows: Number of leading rows to ignore.
        delimiter: Regular expression used to split each row into fields.
        id_idx: Index of the field holding the id.
        label_idx: Index of the field holding the label.

    Returns:
        The generator of `(id, label)` string tuples produced by `parse`.
    """
    file = io.TextIOWrapper(io.BytesIO(bytes), encoding=encoding)
    return parse(
        file,
        skip_rows=skip_rows,
        delimiter=delimiter,
        id_idx=id_idx,
        label_idx=label_idx,
    )


def parse(
    file: io.TextIOWrapper, skip_rows=1, delimiter=r"\s{2,}", id_idx=3, label_idx=2
):
    """Lazily yield `(id, label)` string tuples from the rows of `file`.

    Each row is split with the `delimiter` regular expression; the id is the
    text before the first comma of field `id_idx` and the label is field
    `label_idx`. Both values are stripped of surrounding whitespace.

    Args:
        file: Text stream iterated row by row.
        skip_rows: Number of leading rows to ignore.
        delimiter: Regular expression used to split each row into fields.
        id_idx: Index of the field holding the id.
        label_idx: Index of the field holding the label.

    Yields:
        A `(id, label)` tuple of strings for every non-blank row after the
        skipped ones.

    Raises:
        IndexError: If a non-blank row has fewer fields than `id_idx` or
            `label_idx` requires.
    """
    # Compile once instead of resolving the pattern again for every row.
    splitter = re.compile(delimiter)

    for i, row in enumerate(file):
        # Blank rows (e.g. a trailing empty line) carry no mapping and would
        # otherwise fail the field lookup below.
        if i < skip_rows or not row.strip():
            continue

        fields = splitter.split(row)
        # The last field of a row still carries the line terminator, so strip
        # the extracted values before yielding them.
        row_id = fields[id_idx].split(",")[0].strip()
        label = fields[label_idx].strip()
        yield (row_id, label)
def
to_dataframe(cnv_bytes: bytes, id_dtype=UInt32):
def
parse_from_bytes( bytes: bytes, encoding='utf-8', skip_rows=1, delimiter='\\s{2,}', id_idx=3, label_idx=2):
def parse_from_bytes(
    bytes: bytes,
    encoding="utf-8",
    skip_rows=1,
    delimiter=r"\s{2,}",
    id_idx=3,
    label_idx=2,
):
    """Wrap raw bytes in a text stream and delegate the parsing to `parse`.

    Args:
        bytes: Raw content to parse.
        encoding: Text encoding used to decode `bytes`.
        skip_rows: Forwarded to `parse` unchanged.
        delimiter: Forwarded to `parse` unchanged.
        id_idx: Forwarded to `parse` unchanged.
        label_idx: Forwarded to `parse` unchanged.

    Returns:
        Whatever `parse` returns for the decoded text stream.
    """
    raw_buffer = io.BytesIO(bytes)
    text_stream = io.TextIOWrapper(raw_buffer, encoding=encoding)
    parse_options = {
        "skip_rows": skip_rows,
        "delimiter": delimiter,
        "id_idx": id_idx,
        "label_idx": label_idx,
    }
    return parse(text_stream, **parse_options)
def
parse( file: _io.TextIOWrapper, skip_rows=1, delimiter='\\s{2,}', id_idx=3, label_idx=2):