datasus_db.datasources.sim_do
1import polars as pl 2import logging 3from ..pl_utils import ( 4 upsert_column, 5 to_schema, 6 Column, 7 DateColumn, 8 rename_columns, 9 fill_empty, 10 fill_text, 11 fill_non_numeric, 12) 13from ..datasus import import_from_ftp 14from ..utils import format_year 15from ..ftp import fetch_dbc_as_df 16 17MAIN_TABLE = "SIM_DO" 18 19 20def import_sim_do(db_file="datasus.db", years=["*"], states=["*"]): 21 """Import DO (Declaração de Óbito) from SIM (Sistema de informações de Mortalidade). 22 23 Args: 24 db_file (str, optional): path to the duckdb file in which the data will be imported to. Defaults to "datasus.db". 25 years (list, optional): list of years for which data will be imported (if available). Eg: `[2012, 2000, 2010]`. Defaults to ["*"]. 26 states (list, optional): list of brazilian 2 letters state for which data will be imported (if available). Eg: `["SP", "RJ"]`. Defaults to ["*"]. 27 28 --- 29 30 Extra: 31 - **Data description**: https://github.com/mymatsubara/datasus-db/blob/main/docs/sim_do.pdf 32 - **ftp path non preliminary data**: ftp.datasus.gov.br/dissemin/publicos/SIM/CID10/DORES/DO*.dbc 33 - **ftp path preliminary data**: ftp.datasus.gov.br/dissemin/publicos/SIM/PRELIM/DORES/DO*.dbc 34 """ 35 logging.info(f"⏳ [{MAIN_TABLE}] Starting import for non preliminary data...") 36 import_from_ftp( 37 [MAIN_TABLE], 38 [ 39 f"/dissemin/publicos/SIM/CID10/DORES/DO{state.upper()}{format_year(year, digits=4)}.dbc*" 40 for year in years 41 for state in states 42 ], 43 fetch_sim_do, 44 ftp_exclude_regex=r".*/DOBR.*\.dbc", 45 db_file=db_file, 46 ) 47 48 logging.info(f"⏳ [{MAIN_TABLE}] Starting import for preliminary data...") 49 import_from_ftp( 50 [MAIN_TABLE], 51 [ 52 f"/dissemin/publicos/SIM/PRELIM/DORES/DO{state.upper()}{format_year(year, digits=4)}.dbc*" 53 for year in years 54 for state in states 55 ], 56 fetch_sim_do, 57 ftp_exclude_regex=r".*/DOBR.*\.dbc", 58 db_file=db_file, 59 ) 60 61 62def fetch_sim_do(ftp_path: str): 63 df = fetch_dbc_as_df(ftp_path) 64 return {MAIN_TABLE: map_sim_do(df)} 65 66 67def map_sim_do(df: pl.DataFrame): 68 df = ( 69 df.with_columns(fill_empty(None)) 70 .with_columns( 71 fill_text("NULL", None), 72 ) 73 .with_columns( 74 fill_text("00000000", None), 75 ) 76 .with_columns( 77 fill_non_numeric(None, pl.col(["PESO", "NATURAL"])), 78 ) 79 ) 80 df = rename_columns(df, {"contador": "CONTADOR"}) 81 82 df = ( 83 df.with_columns(upsert_column(df, "DTCADASTRO", pl.Utf8)) 84 .with_columns( 85 pl.when(pl.col("DTOBITO").str.len_chars() == 4) 86 .then("0101" + pl.col("DTOBITO")) 87 .otherwise(pl.col("DTOBITO")) 88 .name.keep(), 89 pl.when(pl.col("DTNASC").str.len_chars() == 4) 90 .then("0101" + pl.col("DTNASC")) 91 .otherwise(pl.col("DTNASC")) 92 .name.keep(), 93 pl.when(pl.col("DTCADASTRO").str.len_chars() == 7) 94 .then("0" + pl.col("DTCADASTRO")) 95 .otherwise(pl.col("DTCADASTRO")) 96 .name.keep(), 97 ) 98 .with_columns( 99 pl.when(pl.col("DTOBITO").str.len_chars() == 6) 100 .then("01" + pl.col("DTOBITO")) 101 .otherwise(pl.col("DTOBITO")) 102 .name.keep(), 103 pl.when(pl.col("DTNASC").str.len_chars() == 6) 104 .then("01" + pl.col("DTNASC")) 105 .otherwise(pl.col("DTNASC")) 106 .name.keep(), 107 ) 108 ) 109 110 return to_schema( 111 df, 112 [ 113 Column("ORIGEM", pl.UInt8), 114 Column("TIPOBITO", pl.UInt8), 115 DateColumn("DTOBITO", "%d%m%Y", strict=False), 116 Column("HORAOBITO", pl.Utf8), 117 Column("NATURAL", pl.UInt32), 118 Column("CODMUNNATU", pl.UInt32), 119 DateColumn("DTNASC", "%d%m%Y", strict=False), 120 Column("IDADE", pl.UInt16), 121 Column("SEXO", pl.UInt8), 122 Column("RACACOR", pl.UInt8), 123 Column("ESTCIV", pl.UInt8, strict=False), 124 Column("ESC", pl.UInt8, strict=False), 125 Column("ESC2010", pl.UInt8, strict=False), 126 Column("SERIESCFAL", pl.UInt8), 127 Column("OCUP", pl.Utf8), 128 Column("CODMUNRES", pl.UInt32), 129 Column("LOCOCOR", pl.UInt8), 130 Column("CODESTAB", pl.UInt32, strict=False), 131 Column("ESTABDESCR", pl.Utf8), 132 Column("CODMUNOCOR", pl.UInt32), 133 Column("IDADEMAE", pl.UInt16, strict=False), 134 Column("ESCMAE", pl.Int8, strict=False), 135 Column("ESCMAE2010", pl.Int8, strict=False), 136 Column("SERIESCMAE", pl.Int8), 137 Column("OCUPMAE", pl.Utf8), 138 Column("QTDFILVIVO", pl.UInt8, strict=False), 139 Column("QTDFILMORT", pl.UInt8, strict=False), 140 Column("GRAVIDEZ", pl.UInt8), 141 Column("SEMAGESTAC", pl.UInt8), 142 Column("GESTACAO", pl.UInt8, strict=False), 143 Column("PARTO", pl.UInt8), 144 Column("OBITOPARTO", pl.UInt8), 145 Column("PESO", pl.UInt32), 146 Column("TPMORTEOCO", pl.UInt8), 147 Column("OBITOGRAV", pl.UInt8), 148 Column("OBITOPUERP", pl.UInt8), 149 Column("ASSISTMED", pl.UInt8), 150 Column("EXAME", pl.UInt8), 151 Column("CIRURGIA", pl.UInt8), 152 Column("NECROPSIA", pl.UInt8), 153 Column("LINHAA", pl.Utf8), 154 Column("LINHAB", pl.Utf8), 155 Column("LINHAC", pl.Utf8), 156 Column("LINHAD", pl.Utf8), 157 Column("LINHAII", pl.Utf8), 158 Column("CAUSABAS", pl.Utf8), 159 Column("CB_PRE", pl.Utf8), 160 Column("COMUNSVOIM", pl.Utf8), 161 DateColumn("DTATESTADO", "%d%m%Y", strict=False), 162 Column("CIRCOBITO", pl.UInt8, strict=False), 163 Column("ACIDTRAB", pl.UInt8), 164 Column("FONTE", pl.Utf8), 165 Column("NUMEROLOTE", pl.Utf8), 166 Column("TPPOS", pl.Utf8), 167 DateColumn("DTINVESTIG", "%d%m%Y"), 168 Column("CAUSABAS_O", pl.Utf8), 169 DateColumn("DTCADASTRO", "%d%m%Y"), 170 Column("ATESTANTE", pl.UInt8), 171 Column("STCODIFICA", pl.Utf8), 172 Column("CODIFICADO", pl.Utf8), 173 Column("VERSAOSIST", pl.Utf8), 174 Column("VERSAOSCB", pl.Utf8), 175 Column("FONTEINV", pl.Utf8), 176 DateColumn("DTRECEBIM", "%d%m%Y"), 177 Column("ATESTADO", pl.Utf8), 178 DateColumn("DTRECORIGA", "%d%m%Y"), 179 Column("CAUSAMAT", pl.Utf8), 180 Column("ESCMAEAGR1", pl.UInt8), 181 Column("ESCFALAGR1", pl.UInt8), 182 Column("STDOEPIDEM", pl.UInt8), 183 Column("STDONOVA", pl.UInt8), 184 Column("DIFDATA", pl.UInt16), 185 Column("NUDIASOBCO", pl.UInt16), 186 Column("NUDIASOBIN", pl.UInt16), 187 DateColumn("DTCADINV", "%d%m%Y"), 188 Column("TPOBITOCOR", pl.UInt8), 189 DateColumn("DTCONINV", "%d%m%Y"), 190 Column("FONTES", pl.Utf8), 191 Column("TPRESGINFO", pl.UInt8), 192 Column("TPNIVELINV", pl.Utf8), 193 Column("NUDIASINF", pl.UInt16), 194 DateColumn("DTCADINF", "%d%m%Y"), 195 Column("MORTEPARTO", pl.UInt8), 196 DateColumn("DTCONCASO", "%d%m%Y"), 197 Column("FONTESINF", pl.Utf8), 198 Column("ALTCAUSA", pl.UInt8), 199 Column("CONTADOR", pl.UInt32), 200 ], 201 )
MAIN_TABLE =
'SIM_DO'
def
import_sim_do(db_file='datasus.db', years=['*'], states=['*']):
21def import_sim_do(db_file="datasus.db", years=["*"], states=["*"]): 22 """Import DO (Declaração de Óbito) from SIM (Sistema de informações de Mortalidade). 23 24 Args: 25 db_file (str, optional): path to the duckdb file in which the data will be imported to. Defaults to "datasus.db". 26 years (list, optional): list of years for which data will be imported (if available). Eg: `[2012, 2000, 2010]`. Defaults to ["*"]. 27 states (list, optional): list of brazilian 2 letters state for which data will be imported (if available). Eg: `["SP", "RJ"]`. Defaults to ["*"]. 28 29 --- 30 31 Extra: 32 - **Data description**: https://github.com/mymatsubara/datasus-db/blob/main/docs/sim_do.pdf 33 - **ftp path non preliminary data**: ftp.datasus.gov.br/dissemin/publicos/SIM/CID10/DORES/DO*.dbc 34 - **ftp path preliminary data**: ftp.datasus.gov.br/dissemin/publicos/SIM/PRELIM/DORES/DO*.dbc 35 """ 36 logging.info(f"⏳ [{MAIN_TABLE}] Starting import for non preliminary data...") 37 import_from_ftp( 38 [MAIN_TABLE], 39 [ 40 f"/dissemin/publicos/SIM/CID10/DORES/DO{state.upper()}{format_year(year, digits=4)}.dbc*" 41 for year in years 42 for state in states 43 ], 44 fetch_sim_do, 45 ftp_exclude_regex=r".*/DOBR.*\.dbc", 46 db_file=db_file, 47 ) 48 49 logging.info(f"⏳ [{MAIN_TABLE}] Starting import for preliminary data...") 50 import_from_ftp( 51 [MAIN_TABLE], 52 [ 53 f"/dissemin/publicos/SIM/PRELIM/DORES/DO{state.upper()}{format_year(year, digits=4)}.dbc*" 54 for year in years 55 for state in states 56 ], 57 fetch_sim_do, 58 ftp_exclude_regex=r".*/DOBR.*\.dbc", 59 db_file=db_file, 60 )
Import DO (Declaração de Óbito) from SIM (Sistema de informações de Mortalidade).
Arguments:
- db_file (str, optional): path to the duckdb file in which the data will be imported to. Defaults to "datasus.db".
- years (list, optional): list of years for which data will be imported (if available). Eg:
[2012, 2000, 2010]
. Defaults to ["*"]. - states (list, optional): list of brazilian 2 letters state for which data will be imported (if available). Eg:
["SP", "RJ"]
. Defaults to ["*"].
Extra:
- Data description: https://github.com/mymatsubara/datasus-db/blob/main/docs/sim_do.pdf
- ftp path non preliminary data: ftp.datasus.gov.br/dissemin/publicos/SIM/CID10/DORES/DO*.dbc
- ftp path preliminary data: ftp.datasus.gov.br/dissemin/publicos/SIM/PRELIM/DORES/DO*.dbc
def
fetch_sim_do(ftp_path: str):
def
map_sim_do(df: polars.dataframe.frame.DataFrame):
68def map_sim_do(df: pl.DataFrame): 69 df = ( 70 df.with_columns(fill_empty(None)) 71 .with_columns( 72 fill_text("NULL", None), 73 ) 74 .with_columns( 75 fill_text("00000000", None), 76 ) 77 .with_columns( 78 fill_non_numeric(None, pl.col(["PESO", "NATURAL"])), 79 ) 80 ) 81 df = rename_columns(df, {"contador": "CONTADOR"}) 82 83 df = ( 84 df.with_columns(upsert_column(df, "DTCADASTRO", pl.Utf8)) 85 .with_columns( 86 pl.when(pl.col("DTOBITO").str.len_chars() == 4) 87 .then("0101" + pl.col("DTOBITO")) 88 .otherwise(pl.col("DTOBITO")) 89 .name.keep(), 90 pl.when(pl.col("DTNASC").str.len_chars() == 4) 91 .then("0101" + pl.col("DTNASC")) 92 .otherwise(pl.col("DTNASC")) 93 .name.keep(), 94 pl.when(pl.col("DTCADASTRO").str.len_chars() == 7) 95 .then("0" + pl.col("DTCADASTRO")) 96 .otherwise(pl.col("DTCADASTRO")) 97 .name.keep(), 98 ) 99 .with_columns( 100 pl.when(pl.col("DTOBITO").str.len_chars() == 6) 101 .then("01" + pl.col("DTOBITO")) 102 .otherwise(pl.col("DTOBITO")) 103 .name.keep(), 104 pl.when(pl.col("DTNASC").str.len_chars() == 6) 105 .then("01" + pl.col("DTNASC")) 106 .otherwise(pl.col("DTNASC")) 107 .name.keep(), 108 ) 109 ) 110 111 return to_schema( 112 df, 113 [ 114 Column("ORIGEM", pl.UInt8), 115 Column("TIPOBITO", pl.UInt8), 116 DateColumn("DTOBITO", "%d%m%Y", strict=False), 117 Column("HORAOBITO", pl.Utf8), 118 Column("NATURAL", pl.UInt32), 119 Column("CODMUNNATU", pl.UInt32), 120 DateColumn("DTNASC", "%d%m%Y", strict=False), 121 Column("IDADE", pl.UInt16), 122 Column("SEXO", pl.UInt8), 123 Column("RACACOR", pl.UInt8), 124 Column("ESTCIV", pl.UInt8, strict=False), 125 Column("ESC", pl.UInt8, strict=False), 126 Column("ESC2010", pl.UInt8, strict=False), 127 Column("SERIESCFAL", pl.UInt8), 128 Column("OCUP", pl.Utf8), 129 Column("CODMUNRES", pl.UInt32), 130 Column("LOCOCOR", pl.UInt8), 131 Column("CODESTAB", pl.UInt32, strict=False), 132 Column("ESTABDESCR", pl.Utf8), 133 Column("CODMUNOCOR", pl.UInt32), 134 Column("IDADEMAE", pl.UInt16, strict=False), 135 Column("ESCMAE", pl.Int8, strict=False), 136 Column("ESCMAE2010", pl.Int8, strict=False), 137 Column("SERIESCMAE", pl.Int8), 138 Column("OCUPMAE", pl.Utf8), 139 Column("QTDFILVIVO", pl.UInt8, strict=False), 140 Column("QTDFILMORT", pl.UInt8, strict=False), 141 Column("GRAVIDEZ", pl.UInt8), 142 Column("SEMAGESTAC", pl.UInt8), 143 Column("GESTACAO", pl.UInt8, strict=False), 144 Column("PARTO", pl.UInt8), 145 Column("OBITOPARTO", pl.UInt8), 146 Column("PESO", pl.UInt32), 147 Column("TPMORTEOCO", pl.UInt8), 148 Column("OBITOGRAV", pl.UInt8), 149 Column("OBITOPUERP", pl.UInt8), 150 Column("ASSISTMED", pl.UInt8), 151 Column("EXAME", pl.UInt8), 152 Column("CIRURGIA", pl.UInt8), 153 Column("NECROPSIA", pl.UInt8), 154 Column("LINHAA", pl.Utf8), 155 Column("LINHAB", pl.Utf8), 156 Column("LINHAC", pl.Utf8), 157 Column("LINHAD", pl.Utf8), 158 Column("LINHAII", pl.Utf8), 159 Column("CAUSABAS", pl.Utf8), 160 Column("CB_PRE", pl.Utf8), 161 Column("COMUNSVOIM", pl.Utf8), 162 DateColumn("DTATESTADO", "%d%m%Y", strict=False), 163 Column("CIRCOBITO", pl.UInt8, strict=False), 164 Column("ACIDTRAB", pl.UInt8), 165 Column("FONTE", pl.Utf8), 166 Column("NUMEROLOTE", pl.Utf8), 167 Column("TPPOS", pl.Utf8), 168 DateColumn("DTINVESTIG", "%d%m%Y"), 169 Column("CAUSABAS_O", pl.Utf8), 170 DateColumn("DTCADASTRO", "%d%m%Y"), 171 Column("ATESTANTE", pl.UInt8), 172 Column("STCODIFICA", pl.Utf8), 173 Column("CODIFICADO", pl.Utf8), 174 Column("VERSAOSIST", pl.Utf8), 175 Column("VERSAOSCB", pl.Utf8), 176 Column("FONTEINV", pl.Utf8), 177 DateColumn("DTRECEBIM", "%d%m%Y"), 178 Column("ATESTADO", pl.Utf8), 179 DateColumn("DTRECORIGA", "%d%m%Y"), 180 Column("CAUSAMAT", pl.Utf8), 181 Column("ESCMAEAGR1", pl.UInt8), 182 Column("ESCFALAGR1", pl.UInt8), 183 Column("STDOEPIDEM", pl.UInt8), 184 Column("STDONOVA", pl.UInt8), 185 Column("DIFDATA", pl.UInt16), 186 Column("NUDIASOBCO", pl.UInt16), 187 Column("NUDIASOBIN", pl.UInt16), 188 DateColumn("DTCADINV", "%d%m%Y"), 189 Column("TPOBITOCOR", pl.UInt8), 190 DateColumn("DTCONINV", "%d%m%Y"), 191 Column("FONTES", pl.Utf8), 192 Column("TPRESGINFO", pl.UInt8), 193 Column("TPNIVELINV", pl.Utf8), 194 Column("NUDIASINF", pl.UInt16), 195 DateColumn("DTCADINF", "%d%m%Y"), 196 Column("MORTEPARTO", pl.UInt8), 197 DateColumn("DTCONCASO", "%d%m%Y"), 198 Column("FONTESINF", pl.Utf8), 199 Column("ALTCAUSA", pl.UInt8), 200 Column("CONTADOR", pl.UInt32), 201 ], 202 )