datasus_db.datasources.po

 1import polars as pl
 2import logging
 3from ..pl_utils import to_schema, Column, DateColumn
 4from ..datasus import import_from_ftp
 5from ..utils import format_year
 6from ..ftp import fetch_dbc_as_df
 7
 8MAIN_TABLE = "PO"
 9
10
def import_po(db_file="datasus.db", years=["*"]):
    """Import PO (Painel de Oncologia) data (since 2013).

    Args:
        db_file (str, optional): path to the duckdb file into which the data will be imported. Defaults to "datasus.db".
        years (list, optional): list of years for which data will be imported (if available). Eg: `[2013, 2020]`. Defaults to ["*"].

    ---

    Extra:
    - **Data description**: https://github.com/mymatsubara/datasus-db/blob/main/docs/po.pdf
    - **ftp path**: ftp.datasus.gov.br/dissemin/publicos/PAINEL_ONCOLOGIA/DADOS/POBR*.dbc
    """
    logging.info(f"⏳ [{MAIN_TABLE}] Starting import...")

    # One FTP pattern per requested year. `years` is only iterated here,
    # never mutated, so the shared list default is not modified.
    ftp_patterns = [
        f"/dissemin/publicos/PAINEL_ONCOLOGIA/DADOS/POBR{format_year(year, digits=4)}.dbc*"
        for year in years
    ]

    import_from_ftp(
        [MAIN_TABLE],
        ftp_patterns,
        fetch_po,
        db_file=db_file,
    )
35
36
def fetch_po(ftp_path: str):
    """Fetch the DBC file at `ftp_path` and return it mapped to the PO schema.

    Args:
        ftp_path (str): path of the file, passed unchanged to `fetch_dbc_as_df`.

    Returns:
        dict: single entry mapping `MAIN_TABLE` to the result of `map_po`
        applied to the fetched dataframe.
    """
    raw_df = fetch_dbc_as_df(ftp_path)
    mapped_df = map_po(raw_df)
    return {MAIN_TABLE: mapped_df}
40
41
def map_po(df: pl.DataFrame):
    """Null out empty strings and apply the PO column specification.

    Every `pl.Utf8` column has its zero-length strings replaced by null
    (column names are kept), then the frame is handed to `to_schema`
    together with the list of PO columns below.

    Args:
        df (pl.DataFrame): raw dataframe as read from the PO DBC file.

    Returns:
        Whatever `to_schema` returns for the cleaned frame and the column
        specification (presumably a dataframe cast to these dtypes; see
        `pl_utils.to_schema`).
    """
    text_columns = pl.col(pl.Utf8)
    empty_to_null = (
        pl.when(text_columns.str.len_chars() == 0)
        .then(None)
        .otherwise(text_columns)
        .name.keep()
    )
    cleaned = df.with_columns([empty_to_null])

    po_columns = [
        Column("ANO_DIAGN", pl.UInt16),
        Column("ANOMES_DIA", pl.UInt32),
        Column("ANO_TRATAM", pl.UInt16),
        Column("ANOMES_TRA", pl.UInt32),
        Column("UF_RESID", pl.UInt8),
        Column("MUN_RESID", pl.UInt32),
        Column("UF_TRATAM", pl.UInt8),
        Column("MUN_TRATAM", pl.UInt32),
        Column("UF_DIAGN", pl.UInt8),
        Column("MUN_DIAG", pl.UInt32),
        Column("TRATAMENTO", pl.UInt8),
        Column("DIAGNOSTIC", pl.UInt8),
        # IDADE is the only column declared with strict=False
        Column("IDADE", pl.UInt8, strict=False),
        Column("SEXO", pl.Utf8),
        Column("ESTADIAM", pl.UInt8),
        Column("CNES_DIAG", pl.UInt32),
        Column("CNES_TRAT", pl.UInt32),
        Column("TEMPO_TRAT", pl.Utf8),
        Column("CNS_PAC", pl.Utf8),
        Column("DIAG_DETH", pl.Utf8),
        DateColumn("DT_DIAG", "%d/%m/%Y"),
        DateColumn("DT_TRAT", "%d/%m/%Y"),
        DateColumn("DT_NASC", "%d/%m/%Y"),
    ]
    return to_schema(cleaned, po_columns)
MAIN_TABLE = 'PO'
def import_po(db_file='datasus.db', years=['*']):
def import_po(db_file="datasus.db", years=["*"]):
    """Import PO (Painel de Oncologia) data (since 2013).

    Args:
        db_file (str, optional): path to the duckdb file into which the data will be imported. Defaults to "datasus.db".
        years (list, optional): list of years for which data will be imported (if available). Eg: `[2013, 2020]`. Defaults to ["*"].

    ---

    Extra:
    - **Data description**: https://github.com/mymatsubara/datasus-db/blob/main/docs/po.pdf
    - **ftp path**: ftp.datasus.gov.br/dissemin/publicos/PAINEL_ONCOLOGIA/DADOS/POBR*.dbc
    """
    logging.info(f"⏳ [{MAIN_TABLE}] Starting import...")

    # One FTP pattern per requested year. `years` is only iterated here,
    # never mutated, so the shared list default is not modified.
    ftp_patterns = [
        f"/dissemin/publicos/PAINEL_ONCOLOGIA/DADOS/POBR{format_year(year, digits=4)}.dbc*"
        for year in years
    ]

    import_from_ftp(
        [MAIN_TABLE],
        ftp_patterns,
        fetch_po,
        db_file=db_file,
    )

Import PO (Painel de Oncologia) data (since 2013).

Arguments:
  • db_file (str, optional): path to the duckdb file into which the data will be imported. Defaults to "datasus.db".
  • years (list, optional): list of years for which data will be imported (if available), e.g. [2013, 2020]. Defaults to ["*"].

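Extra:
  • Data description: https://github.com/mymatsubara/datasus-db/blob/main/docs/po.pdf
  • ftp path: ftp.datasus.gov.br/dissemin/publicos/PAINEL_ONCOLOGIA/DADOS/POBR*.dbc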

def fetch_po(ftp_path: str):
def fetch_po(ftp_path: str):
    """Fetch the DBC file at `ftp_path` and return it mapped to the PO schema.

    Args:
        ftp_path (str): path of the file, passed unchanged to `fetch_dbc_as_df`.

    Returns:
        dict: single entry mapping `MAIN_TABLE` to the result of `map_po`
        applied to the fetched dataframe.
    """
    raw_df = fetch_dbc_as_df(ftp_path)
    mapped_df = map_po(raw_df)
    return {MAIN_TABLE: mapped_df}
def map_po(df: polars.dataframe.frame.DataFrame):
def map_po(df: pl.DataFrame):
    """Null out empty strings and apply the PO column specification.

    Every `pl.Utf8` column has its zero-length strings replaced by null
    (column names are kept), then the frame is handed to `to_schema`
    together with the list of PO columns below.

    Args:
        df (pl.DataFrame): raw dataframe as read from the PO DBC file.

    Returns:
        Whatever `to_schema` returns for the cleaned frame and the column
        specification (presumably a dataframe cast to these dtypes; see
        `pl_utils.to_schema`).
    """
    text_columns = pl.col(pl.Utf8)
    empty_to_null = (
        pl.when(text_columns.str.len_chars() == 0)
        .then(None)
        .otherwise(text_columns)
        .name.keep()
    )
    cleaned = df.with_columns([empty_to_null])

    po_columns = [
        Column("ANO_DIAGN", pl.UInt16),
        Column("ANOMES_DIA", pl.UInt32),
        Column("ANO_TRATAM", pl.UInt16),
        Column("ANOMES_TRA", pl.UInt32),
        Column("UF_RESID", pl.UInt8),
        Column("MUN_RESID", pl.UInt32),
        Column("UF_TRATAM", pl.UInt8),
        Column("MUN_TRATAM", pl.UInt32),
        Column("UF_DIAGN", pl.UInt8),
        Column("MUN_DIAG", pl.UInt32),
        Column("TRATAMENTO", pl.UInt8),
        Column("DIAGNOSTIC", pl.UInt8),
        # IDADE is the only column declared with strict=False
        Column("IDADE", pl.UInt8, strict=False),
        Column("SEXO", pl.Utf8),
        Column("ESTADIAM", pl.UInt8),
        Column("CNES_DIAG", pl.UInt32),
        Column("CNES_TRAT", pl.UInt32),
        Column("TEMPO_TRAT", pl.Utf8),
        Column("CNS_PAC", pl.Utf8),
        Column("DIAG_DETH", pl.Utf8),
        DateColumn("DT_DIAG", "%d/%m/%Y"),
        DateColumn("DT_TRAT", "%d/%m/%Y"),
        DateColumn("DT_NASC", "%d/%m/%Y"),
    ]
    return to_schema(cleaned, po_columns)