datasus_db.datasources.sim_do

  1import polars as pl
  2import logging
  3from ..pl_utils import (
  4    upsert_column,
  5    to_schema,
  6    Column,
  7    DateColumn,
  8    rename_columns,
  9    fill_empty,
 10    fill_text,
 11    fill_non_numeric,
 12)
 13from ..datasus import import_from_ftp
 14from ..utils import format_year
 15from ..ftp import fetch_dbc_as_df
 16
 17MAIN_TABLE = "SIM_DO"
 18
 19
 20def import_sim_do(db_file="datasus.db", years=["*"], states=["*"]):
 21    """Import DO (Declaração de Óbito) from SIM (Sistema de informações de Mortalidade).
 22
 23    Args:
 24        db_file (str, optional): path to the duckdb file in which the data will be imported to. Defaults to "datasus.db".
 25        years (list, optional): list of years for which data will be imported (if available). Eg: `[2012, 2000, 2010]`. Defaults to ["*"].
 26        states (list, optional): list of brazilian 2 letters state for which data will be imported (if available). Eg: `["SP", "RJ"]`. Defaults to ["*"].
 27
 28    ---
 29
 30    Extra:
 31    - **Data description**: https://github.com/mymatsubara/datasus-db/blob/main/docs/sim_do.pdf
 32    - **ftp path non preliminary data**: ftp.datasus.gov.br/dissemin/publicos/SIM/CID10/DORES/DO*.dbc
 33    - **ftp path preliminary data**: ftp.datasus.gov.br/dissemin/publicos/SIM/PRELIM/DORES/DO*.dbc
 34    """
 35    logging.info(f"⏳ [{MAIN_TABLE}] Starting import for non preliminary data...")
 36    import_from_ftp(
 37        [MAIN_TABLE],
 38        [
 39            f"/dissemin/publicos/SIM/CID10/DORES/DO{state.upper()}{format_year(year, digits=4)}.dbc*"
 40            for year in years
 41            for state in states
 42        ],
 43        fetch_sim_do,
 44        ftp_exclude_regex=r".*/DOBR.*\.dbc",
 45        db_file=db_file,
 46    )
 47
 48    logging.info(f"⏳ [{MAIN_TABLE}] Starting import for preliminary data...")
 49    import_from_ftp(
 50        [MAIN_TABLE],
 51        [
 52            f"/dissemin/publicos/SIM/PRELIM/DORES/DO{state.upper()}{format_year(year, digits=4)}.dbc*"
 53            for year in years
 54            for state in states
 55        ],
 56        fetch_sim_do,
 57        ftp_exclude_regex=r".*/DOBR.*\.dbc",
 58        db_file=db_file,
 59    )
 60
 61
 62def fetch_sim_do(ftp_path: str):
 63    df = fetch_dbc_as_df(ftp_path)
 64    return {MAIN_TABLE: map_sim_do(df)}
 65
 66
 67def map_sim_do(df: pl.DataFrame):
 68    df = (
 69        df.with_columns(fill_empty(None))
 70        .with_columns(
 71            fill_text("NULL", None),
 72        )
 73        .with_columns(
 74            fill_text("00000000", None),
 75        )
 76        .with_columns(
 77            fill_non_numeric(None, pl.col(["PESO", "NATURAL"])),
 78        )
 79    )
 80    df = rename_columns(df, {"contador": "CONTADOR"})
 81
 82    df = (
 83        df.with_columns(upsert_column(df, "DTCADASTRO", pl.Utf8))
 84        .with_columns(
 85            pl.when(pl.col("DTOBITO").str.len_chars() == 4)
 86            .then("0101" + pl.col("DTOBITO"))
 87            .otherwise(pl.col("DTOBITO"))
 88            .name.keep(),
 89            pl.when(pl.col("DTNASC").str.len_chars() == 4)
 90            .then("0101" + pl.col("DTNASC"))
 91            .otherwise(pl.col("DTNASC"))
 92            .name.keep(),
 93            pl.when(pl.col("DTCADASTRO").str.len_chars() == 7)
 94            .then("0" + pl.col("DTCADASTRO"))
 95            .otherwise(pl.col("DTCADASTRO"))
 96            .name.keep(),
 97        )
 98        .with_columns(
 99            pl.when(pl.col("DTOBITO").str.len_chars() == 6)
100            .then("01" + pl.col("DTOBITO"))
101            .otherwise(pl.col("DTOBITO"))
102            .name.keep(),
103            pl.when(pl.col("DTNASC").str.len_chars() == 6)
104            .then("01" + pl.col("DTNASC"))
105            .otherwise(pl.col("DTNASC"))
106            .name.keep(),
107        )
108    )
109
110    return to_schema(
111        df,
112        [
113            Column("ORIGEM", pl.UInt8),
114            Column("TIPOBITO", pl.UInt8),
115            DateColumn("DTOBITO", "%d%m%Y", strict=False),
116            Column("HORAOBITO", pl.Utf8),
117            Column("NATURAL", pl.UInt32),
118            Column("CODMUNNATU", pl.UInt32),
119            DateColumn("DTNASC", "%d%m%Y", strict=False),
120            Column("IDADE", pl.UInt16),
121            Column("SEXO", pl.UInt8),
122            Column("RACACOR", pl.UInt8),
123            Column("ESTCIV", pl.UInt8, strict=False),
124            Column("ESC", pl.UInt8, strict=False),
125            Column("ESC2010", pl.UInt8, strict=False),
126            Column("SERIESCFAL", pl.UInt8),
127            Column("OCUP", pl.Utf8),
128            Column("CODMUNRES", pl.UInt32),
129            Column("LOCOCOR", pl.UInt8),
130            Column("CODESTAB", pl.UInt32, strict=False),
131            Column("ESTABDESCR", pl.Utf8),
132            Column("CODMUNOCOR", pl.UInt32),
133            Column("IDADEMAE", pl.UInt16, strict=False),
134            Column("ESCMAE", pl.Int8, strict=False),
135            Column("ESCMAE2010", pl.Int8, strict=False),
136            Column("SERIESCMAE", pl.Int8),
137            Column("OCUPMAE", pl.Utf8),
138            Column("QTDFILVIVO", pl.UInt8, strict=False),
139            Column("QTDFILMORT", pl.UInt8, strict=False),
140            Column("GRAVIDEZ", pl.UInt8),
141            Column("SEMAGESTAC", pl.UInt8),
142            Column("GESTACAO", pl.UInt8, strict=False),
143            Column("PARTO", pl.UInt8),
144            Column("OBITOPARTO", pl.UInt8),
145            Column("PESO", pl.UInt32),
146            Column("TPMORTEOCO", pl.UInt8),
147            Column("OBITOGRAV", pl.UInt8),
148            Column("OBITOPUERP", pl.UInt8),
149            Column("ASSISTMED", pl.UInt8),
150            Column("EXAME", pl.UInt8),
151            Column("CIRURGIA", pl.UInt8),
152            Column("NECROPSIA", pl.UInt8),
153            Column("LINHAA", pl.Utf8),
154            Column("LINHAB", pl.Utf8),
155            Column("LINHAC", pl.Utf8),
156            Column("LINHAD", pl.Utf8),
157            Column("LINHAII", pl.Utf8),
158            Column("CAUSABAS", pl.Utf8),
159            Column("CB_PRE", pl.Utf8),
160            Column("COMUNSVOIM", pl.Utf8),
161            DateColumn("DTATESTADO", "%d%m%Y", strict=False),
162            Column("CIRCOBITO", pl.UInt8, strict=False),
163            Column("ACIDTRAB", pl.UInt8),
164            Column("FONTE", pl.Utf8),
165            Column("NUMEROLOTE", pl.Utf8),
166            Column("TPPOS", pl.Utf8),
167            DateColumn("DTINVESTIG", "%d%m%Y"),
168            Column("CAUSABAS_O", pl.Utf8),
169            DateColumn("DTCADASTRO", "%d%m%Y"),
170            Column("ATESTANTE", pl.UInt8),
171            Column("STCODIFICA", pl.Utf8),
172            Column("CODIFICADO", pl.Utf8),
173            Column("VERSAOSIST", pl.Utf8),
174            Column("VERSAOSCB", pl.Utf8),
175            Column("FONTEINV", pl.Utf8),
176            DateColumn("DTRECEBIM", "%d%m%Y"),
177            Column("ATESTADO", pl.Utf8),
178            DateColumn("DTRECORIGA", "%d%m%Y"),
179            Column("CAUSAMAT", pl.Utf8),
180            Column("ESCMAEAGR1", pl.UInt8),
181            Column("ESCFALAGR1", pl.UInt8),
182            Column("STDOEPIDEM", pl.UInt8),
183            Column("STDONOVA", pl.UInt8),
184            Column("DIFDATA", pl.UInt16),
185            Column("NUDIASOBCO", pl.UInt16),
186            Column("NUDIASOBIN", pl.UInt16),
187            DateColumn("DTCADINV", "%d%m%Y"),
188            Column("TPOBITOCOR", pl.UInt8),
189            DateColumn("DTCONINV", "%d%m%Y"),
190            Column("FONTES", pl.Utf8),
191            Column("TPRESGINFO", pl.UInt8),
192            Column("TPNIVELINV", pl.Utf8),
193            Column("NUDIASINF", pl.UInt16),
194            DateColumn("DTCADINF", "%d%m%Y"),
195            Column("MORTEPARTO", pl.UInt8),
196            DateColumn("DTCONCASO", "%d%m%Y"),
197            Column("FONTESINF", pl.Utf8),
198            Column("ALTCAUSA", pl.UInt8),
199            Column("CONTADOR", pl.UInt32),
200        ],
201    )
MAIN_TABLE = 'SIM_DO'
def import_sim_do(db_file='datasus.db', years=['*'], states=['*']):
21def import_sim_do(db_file="datasus.db", years=["*"], states=["*"]):
22    """Import DO (Declaração de Óbito) from SIM (Sistema de informações de Mortalidade).
23
24    Args:
25        db_file (str, optional): path to the duckdb file in which the data will be imported to. Defaults to "datasus.db".
26        years (list, optional): list of years for which data will be imported (if available). Eg: `[2012, 2000, 2010]`. Defaults to ["*"].
27        states (list, optional): list of brazilian 2 letters state for which data will be imported (if available). Eg: `["SP", "RJ"]`. Defaults to ["*"].
28
29    ---
30
31    Extra:
32    - **Data description**: https://github.com/mymatsubara/datasus-db/blob/main/docs/sim_do.pdf
33    - **ftp path non preliminary data**: ftp.datasus.gov.br/dissemin/publicos/SIM/CID10/DORES/DO*.dbc
34    - **ftp path preliminary data**: ftp.datasus.gov.br/dissemin/publicos/SIM/PRELIM/DORES/DO*.dbc
35    """
36    logging.info(f"⏳ [{MAIN_TABLE}] Starting import for non preliminary data...")
37    import_from_ftp(
38        [MAIN_TABLE],
39        [
40            f"/dissemin/publicos/SIM/CID10/DORES/DO{state.upper()}{format_year(year, digits=4)}.dbc*"
41            for year in years
42            for state in states
43        ],
44        fetch_sim_do,
45        ftp_exclude_regex=r".*/DOBR.*\.dbc",
46        db_file=db_file,
47    )
48
49    logging.info(f"⏳ [{MAIN_TABLE}] Starting import for preliminary data...")
50    import_from_ftp(
51        [MAIN_TABLE],
52        [
53            f"/dissemin/publicos/SIM/PRELIM/DORES/DO{state.upper()}{format_year(year, digits=4)}.dbc*"
54            for year in years
55            for state in states
56        ],
57        fetch_sim_do,
58        ftp_exclude_regex=r".*/DOBR.*\.dbc",
59        db_file=db_file,
60    )

Import DO (Declaração de Óbito) from SIM (Sistema de informações de Mortalidade).

Arguments:
  • db_file (str, optional): path to the duckdb file in which the data will be imported to. Defaults to "datasus.db".
  • years (list, optional): list of years for which data will be imported (if available). Eg: [2012, 2000, 2010]. Defaults to ["*"].
  • states (list, optional): list of brazilian 2 letters state for which data will be imported (if available). Eg: ["SP", "RJ"]. Defaults to ["*"].

Extra:

def fetch_sim_do(ftp_path: str):
63def fetch_sim_do(ftp_path: str):
64    df = fetch_dbc_as_df(ftp_path)
65    return {MAIN_TABLE: map_sim_do(df)}
def map_sim_do(df: polars.dataframe.frame.DataFrame):
 68def map_sim_do(df: pl.DataFrame):
 69    df = (
 70        df.with_columns(fill_empty(None))
 71        .with_columns(
 72            fill_text("NULL", None),
 73        )
 74        .with_columns(
 75            fill_text("00000000", None),
 76        )
 77        .with_columns(
 78            fill_non_numeric(None, pl.col(["PESO", "NATURAL"])),
 79        )
 80    )
 81    df = rename_columns(df, {"contador": "CONTADOR"})
 82
 83    df = (
 84        df.with_columns(upsert_column(df, "DTCADASTRO", pl.Utf8))
 85        .with_columns(
 86            pl.when(pl.col("DTOBITO").str.len_chars() == 4)
 87            .then("0101" + pl.col("DTOBITO"))
 88            .otherwise(pl.col("DTOBITO"))
 89            .name.keep(),
 90            pl.when(pl.col("DTNASC").str.len_chars() == 4)
 91            .then("0101" + pl.col("DTNASC"))
 92            .otherwise(pl.col("DTNASC"))
 93            .name.keep(),
 94            pl.when(pl.col("DTCADASTRO").str.len_chars() == 7)
 95            .then("0" + pl.col("DTCADASTRO"))
 96            .otherwise(pl.col("DTCADASTRO"))
 97            .name.keep(),
 98        )
 99        .with_columns(
100            pl.when(pl.col("DTOBITO").str.len_chars() == 6)
101            .then("01" + pl.col("DTOBITO"))
102            .otherwise(pl.col("DTOBITO"))
103            .name.keep(),
104            pl.when(pl.col("DTNASC").str.len_chars() == 6)
105            .then("01" + pl.col("DTNASC"))
106            .otherwise(pl.col("DTNASC"))
107            .name.keep(),
108        )
109    )
110
111    return to_schema(
112        df,
113        [
114            Column("ORIGEM", pl.UInt8),
115            Column("TIPOBITO", pl.UInt8),
116            DateColumn("DTOBITO", "%d%m%Y", strict=False),
117            Column("HORAOBITO", pl.Utf8),
118            Column("NATURAL", pl.UInt32),
119            Column("CODMUNNATU", pl.UInt32),
120            DateColumn("DTNASC", "%d%m%Y", strict=False),
121            Column("IDADE", pl.UInt16),
122            Column("SEXO", pl.UInt8),
123            Column("RACACOR", pl.UInt8),
124            Column("ESTCIV", pl.UInt8, strict=False),
125            Column("ESC", pl.UInt8, strict=False),
126            Column("ESC2010", pl.UInt8, strict=False),
127            Column("SERIESCFAL", pl.UInt8),
128            Column("OCUP", pl.Utf8),
129            Column("CODMUNRES", pl.UInt32),
130            Column("LOCOCOR", pl.UInt8),
131            Column("CODESTAB", pl.UInt32, strict=False),
132            Column("ESTABDESCR", pl.Utf8),
133            Column("CODMUNOCOR", pl.UInt32),
134            Column("IDADEMAE", pl.UInt16, strict=False),
135            Column("ESCMAE", pl.Int8, strict=False),
136            Column("ESCMAE2010", pl.Int8, strict=False),
137            Column("SERIESCMAE", pl.Int8),
138            Column("OCUPMAE", pl.Utf8),
139            Column("QTDFILVIVO", pl.UInt8, strict=False),
140            Column("QTDFILMORT", pl.UInt8, strict=False),
141            Column("GRAVIDEZ", pl.UInt8),
142            Column("SEMAGESTAC", pl.UInt8),
143            Column("GESTACAO", pl.UInt8, strict=False),
144            Column("PARTO", pl.UInt8),
145            Column("OBITOPARTO", pl.UInt8),
146            Column("PESO", pl.UInt32),
147            Column("TPMORTEOCO", pl.UInt8),
148            Column("OBITOGRAV", pl.UInt8),
149            Column("OBITOPUERP", pl.UInt8),
150            Column("ASSISTMED", pl.UInt8),
151            Column("EXAME", pl.UInt8),
152            Column("CIRURGIA", pl.UInt8),
153            Column("NECROPSIA", pl.UInt8),
154            Column("LINHAA", pl.Utf8),
155            Column("LINHAB", pl.Utf8),
156            Column("LINHAC", pl.Utf8),
157            Column("LINHAD", pl.Utf8),
158            Column("LINHAII", pl.Utf8),
159            Column("CAUSABAS", pl.Utf8),
160            Column("CB_PRE", pl.Utf8),
161            Column("COMUNSVOIM", pl.Utf8),
162            DateColumn("DTATESTADO", "%d%m%Y", strict=False),
163            Column("CIRCOBITO", pl.UInt8, strict=False),
164            Column("ACIDTRAB", pl.UInt8),
165            Column("FONTE", pl.Utf8),
166            Column("NUMEROLOTE", pl.Utf8),
167            Column("TPPOS", pl.Utf8),
168            DateColumn("DTINVESTIG", "%d%m%Y"),
169            Column("CAUSABAS_O", pl.Utf8),
170            DateColumn("DTCADASTRO", "%d%m%Y"),
171            Column("ATESTANTE", pl.UInt8),
172            Column("STCODIFICA", pl.Utf8),
173            Column("CODIFICADO", pl.Utf8),
174            Column("VERSAOSIST", pl.Utf8),
175            Column("VERSAOSCB", pl.Utf8),
176            Column("FONTEINV", pl.Utf8),
177            DateColumn("DTRECEBIM", "%d%m%Y"),
178            Column("ATESTADO", pl.Utf8),
179            DateColumn("DTRECORIGA", "%d%m%Y"),
180            Column("CAUSAMAT", pl.Utf8),
181            Column("ESCMAEAGR1", pl.UInt8),
182            Column("ESCFALAGR1", pl.UInt8),
183            Column("STDOEPIDEM", pl.UInt8),
184            Column("STDONOVA", pl.UInt8),
185            Column("DIFDATA", pl.UInt16),
186            Column("NUDIASOBCO", pl.UInt16),
187            Column("NUDIASOBIN", pl.UInt16),
188            DateColumn("DTCADINV", "%d%m%Y"),
189            Column("TPOBITOCOR", pl.UInt8),
190            DateColumn("DTCONINV", "%d%m%Y"),
191            Column("FONTES", pl.Utf8),
192            Column("TPRESGINFO", pl.UInt8),
193            Column("TPNIVELINV", pl.Utf8),
194            Column("NUDIASINF", pl.UInt16),
195            DateColumn("DTCADINF", "%d%m%Y"),
196            Column("MORTEPARTO", pl.UInt8),
197            DateColumn("DTCONCASO", "%d%m%Y"),
198            Column("FONTESINF", pl.Utf8),
199            Column("ALTCAUSA", pl.UInt8),
200            Column("CONTADOR", pl.UInt32),
201        ],
202    )