datasus_db.cnv

Module with functions to deal with DATASUS convention files (*.cnv), which are usually files that map IDs to readable names.

 1"""
 2Module with functions to deal with DATASUS convention files (*.cnv), which are usually files that map IDs to readable names.
 3"""
 4
 5import io
 6import re
 7import polars as pl
 8from .pl_utils import Column, to_schema
 9
10
def to_dataframe(cnv_bytes: bytes, id_dtype=pl.UInt32):
    """Build a two-column polars DataFrame from the raw bytes of a .cnv file.

    The bytes are decoded as latin-1 and parsed by ``parse_from_bytes`` with
    its default layout settings. The two parsed columns are renamed to
    ``ID`` and ``NOME`` before the frame is handed to ``to_schema``.

    Args:
        cnv_bytes: Raw content of the .cnv file.
        id_dtype: polars dtype requested for the ``ID`` column
            (``pl.UInt32`` by default).

    Returns:
        The result of ``to_schema`` applied to the renamed frame with the
        column specs ``ID`` (``id_dtype``) and ``NOME`` (``pl.Utf8``).
        NOTE(review): presumably a DataFrame cast to those dtypes -- confirm
        against ``pl_utils.to_schema``.
    """
    records = parse_from_bytes(cnv_bytes, encoding="latin-1")
    # These are the column names the original code expects polars to assign
    # to unnamed columns.
    column_names = {"column_0": "ID", "column_1": "NOME"}
    frame = pl.DataFrame(records).rename(column_names)

    target_schema = [Column("ID", id_dtype), Column("NOME", pl.Utf8)]
    return to_schema(frame, target_schema)
17
18
def parse_from_bytes(
    bytes: bytes,
    encoding="utf-8",
    skip_rows=1,
    delimiter=r"\s{2,}",
    id_idx=3,
    label_idx=2,
):
    """Decode in-memory .cnv content and parse it with ``parse``.

    Args:
        bytes: Raw file content.
        encoding: Text encoding used to decode ``bytes``.
        skip_rows: Forwarded to ``parse``.
        delimiter: Forwarded to ``parse``.
        id_idx: Forwarded to ``parse``.
        label_idx: Forwarded to ``parse``.

    Returns:
        Whatever ``parse`` returns for the decoded text stream.
    """
    raw_buffer = io.BytesIO(bytes)
    text_stream = io.TextIOWrapper(raw_buffer, encoding=encoding)
    layout = {
        "skip_rows": skip_rows,
        "delimiter": delimiter,
        "id_idx": id_idx,
        "label_idx": label_idx,
    }
    return parse(text_stream, **layout)
35
36
def parse(
    file: io.TextIOWrapper, skip_rows=1, delimiter=r"\s{2,}", id_idx=3, label_idx=2
):
    """Yield ``(id, label)`` string pairs from the lines of a text stream.

    Each line after the first ``skip_rows`` lines is split with the
    ``delimiter`` regular expression. The id is the part of field ``id_idx``
    that precedes the first comma; the label is field ``label_idx``.

    Args:
        file: Iterable of text lines (for example an open text file).
        skip_rows: Number of leading lines to ignore.
        delimiter: Regular expression used to split a line into fields.
        id_idx: Index of the field that holds the id.
        label_idx: Index of the field that holds the label.

    Yields:
        ``(id, label)`` tuples of strings, one per non-blank data line.

    Raises:
        IndexError: If a non-blank line has fewer fields than ``id_idx`` or
            ``label_idx`` requires.
    """
    # Compile once instead of resolving the pattern again for every row.
    splitter = re.compile(delimiter)

    for line_no, row in enumerate(file):
        if line_no < skip_rows:
            continue

        # Drop the line terminator so it cannot leak into the last field
        # (e.g. an id with no trailing comma used to come out as "12\n").
        row = row.rstrip("\r\n")

        # Blank lines carry no record; indexing their fields would raise.
        if not row.strip():
            continue

        fields = splitter.split(row)
        entry_id = fields[id_idx].split(",")[0]
        yield (entry_id, fields[label_idx])
def to_dataframe(cnv_bytes: bytes, id_dtype=UInt32):
def to_dataframe(cnv_bytes: bytes, id_dtype=pl.UInt32):
    """Build a two-column polars DataFrame from the raw bytes of a .cnv file.

    The bytes are decoded as latin-1 and parsed by ``parse_from_bytes`` with
    its default layout settings. The two parsed columns are renamed to
    ``ID`` and ``NOME`` before the frame is handed to ``to_schema``.

    Args:
        cnv_bytes: Raw content of the .cnv file.
        id_dtype: polars dtype requested for the ``ID`` column
            (``pl.UInt32`` by default).

    Returns:
        The result of ``to_schema`` applied to the renamed frame with the
        column specs ``ID`` (``id_dtype``) and ``NOME`` (``pl.Utf8``).
        NOTE(review): presumably a DataFrame cast to those dtypes -- confirm
        against ``pl_utils.to_schema``.
    """
    records = parse_from_bytes(cnv_bytes, encoding="latin-1")
    # These are the column names the original code expects polars to assign
    # to unnamed columns.
    column_names = {"column_0": "ID", "column_1": "NOME"}
    frame = pl.DataFrame(records).rename(column_names)

    target_schema = [Column("ID", id_dtype), Column("NOME", pl.Utf8)]
    return to_schema(frame, target_schema)
def parse_from_bytes( bytes: bytes, encoding='utf-8', skip_rows=1, delimiter='\\s{2,}', id_idx=3, label_idx=2):
def parse_from_bytes(
    bytes: bytes,
    encoding="utf-8",
    skip_rows=1,
    delimiter=r"\s{2,}",
    id_idx=3,
    label_idx=2,
):
    """Decode in-memory .cnv content and parse it with ``parse``.

    Args:
        bytes: Raw file content.
        encoding: Text encoding used to decode ``bytes``.
        skip_rows: Forwarded to ``parse``.
        delimiter: Forwarded to ``parse``.
        id_idx: Forwarded to ``parse``.
        label_idx: Forwarded to ``parse``.

    Returns:
        Whatever ``parse`` returns for the decoded text stream.
    """
    raw_buffer = io.BytesIO(bytes)
    text_stream = io.TextIOWrapper(raw_buffer, encoding=encoding)
    layout = {
        "skip_rows": skip_rows,
        "delimiter": delimiter,
        "id_idx": id_idx,
        "label_idx": label_idx,
    }
    return parse(text_stream, **layout)
def parse( file: _io.TextIOWrapper, skip_rows=1, delimiter='\\s{2,}', id_idx=3, label_idx=2):
def parse(
    file: io.TextIOWrapper, skip_rows=1, delimiter=r"\s{2,}", id_idx=3, label_idx=2
):
    """Yield ``(id, label)`` string pairs from the lines of a text stream.

    Each line after the first ``skip_rows`` lines is split with the
    ``delimiter`` regular expression. The id is the part of field ``id_idx``
    that precedes the first comma; the label is field ``label_idx``.

    Args:
        file: Iterable of text lines (for example an open text file).
        skip_rows: Number of leading lines to ignore.
        delimiter: Regular expression used to split a line into fields.
        id_idx: Index of the field that holds the id.
        label_idx: Index of the field that holds the label.

    Yields:
        ``(id, label)`` tuples of strings, one per non-blank data line.

    Raises:
        IndexError: If a non-blank line has fewer fields than ``id_idx`` or
            ``label_idx`` requires.
    """
    # Compile once instead of resolving the pattern again for every row.
    splitter = re.compile(delimiter)

    for line_no, row in enumerate(file):
        if line_no < skip_rows:
            continue

        # Drop the line terminator so it cannot leak into the last field
        # (e.g. an id with no trailing comma used to come out as "12\n").
        row = row.rstrip("\r\n")

        # Blank lines carry no record; indexing their fields would raise.
        if not row.strip():
            continue

        fields = splitter.split(row)
        entry_id = fields[id_idx].split(",")[0]
        yield (entry_id, fields[label_idx])