diff --git a/Procesamiento-Datos-2/README.md b/Procesamiento-Datos-2/README.md new file mode 100644 index 0000000..136dad8 --- /dev/null +++ b/Procesamiento-Datos-2/README.md @@ -0,0 +1,53 @@ +# Fase 2 — Procesamiento y Combinación por Año (Adicciones ↔ Violencia) + +Esta fase limpia y normaliza los datos cargados en PostgreSQL, elimina outliers y genera un **dataset combinado por año** listo para análisis avanzado. + +--- + +## Ejecución + +1) Ajusta la conexión en el script si es necesario (host/puerto/DB). +2) Ejecuta el procesador: + +```bash +python3 process.py +``` + +--- + +## Descripción Técnica + +- Lectura de tablas desde **PostgreSQL**. +- Normalización de columnas (incluye mapeo de `año` desde alias como `a_o`, `anio`, etc.). +- Limpieza de valores vacíos y **eliminación de outliers (IQR)**. +- **Resumen estadístico por dataset** (solo variables numéricas). +- **Merge por `año`** para generar un dataset ancho (outer join). +- Salidas en CSV para su uso en Fase 3. + +--- + +## Requisitos + +Archivo `requirements.txt`: + +```txt +pandas +sqlalchemy +psycopg2-binary +numpy +``` + +Instalación rápida: + +```bash +pip install -r requirements.txt +``` + +--- + +## Salida + +- `resumen_estadistico_por_dataset.csv` — Descriptivos numéricos por tabla. +- `dataset_combinado_por_año.csv` — Dataset ancho fusionado por año (limpio y sin outliers). 
+ +*Esta fase deja los datos listos para el análisis estadístico extensivo de la Fase 3.* diff --git a/Procesamiento-Datos-2/dataset_combinado_por_año.csv b/Procesamiento-Datos-2/dataset_combinado_por_año.csv new file mode 100644 index 0000000..3368bc7 --- /dev/null +++ b/Procesamiento-Datos-2/dataset_combinado_por_año.csv @@ -0,0 +1,91 @@ +año,provincia,almac_n_m_quinas,bingo,bar_esp_,bolera,casinos,camping,centro_de_ocio,caf_teatro,discoteca,bares,cafeter_as,pub,centro_recreativo_familiar,restaurante,sal_n_de_juego,casas_de_apuestas,c_rner_ap_,zona_ap_,almac_n_apuestas,total_a_31_12_,prov,altas_otras_causas,altas_petici_n_propia,bajas_otras_causas,bajas_petici_n_propia,total_en_activo_a_31_12 +2024,SEGOVIA,69,1,1,0,0,0,0,0,0,597,3,0,0,0,5,1,3,3,3,686.0,SE,0,152,0,140,1064 +2024,SORIA,30,1,1,0,0,0,0,0,0,346,25,0,0,0,5,0,1,5,3,417.0,SO,0,152,0,140,1064 +2024,PALENCIA,95,1,0,0,0,0,0,0,5,944,43,0,1,9,12,1,9,5,4,1129.0,PA,0,152,0,140,1064 +2024,SALAMANCA,133,2,0,0,1,0,8,0,0,2779,0,0,0,0,13,3,10,7,4,2960.0,SA,0,152,0,140,1064 +2024,BURGOS,84,3,0,0,0,2,0,0,0,2170,96,0,0,0,20,0,11,12,4,2402.0,BU,0,152,0,140,1064 +2024,ÁVILA,72,1,0,0,0,0,0,1,3,1050,29,0,0,3,5,1,2,4,4,1175.0,AV,0,152,0,140,1064 +2024,VALLADOLID,153,4,0,0,1,0,2,0,0,2915,57,0,0,0,30,5,22,13,4,3206.0,VA,0,152,0,140,1064 +2024,LEÓN,191,3,0,1,1,2,0,0,4,3412,22,1,0,20,18,2,11,10,4,3702.0,LE,0,152,0,140,1064 +2024,ZAMORA,74,1,0,0,0,1,0,0,0,1487,73,0,0,,8,1,3,6,4,1658.0,ZA,0,152,0,140,1064 +2023,ZAMORA,76,1,,0,0,1,0,0,0,1477,73,0,0,0,8,1,,,3,1640.0,ZA,0,167,0,129,1059 +2023,SORIA,28,1,,0,0,0,0,0,0,361,26,0,0,0,5,1,,,2,424.0,SO,0,167,0,129,1059 +2023,PALENCIA,96,1,,0,0,0,0,0,5,9959,43,0,1,9,12,2,,,4,10132.0,PA,0,167,0,129,1059 +2023,SALAMANCA,132,2,,0,1,0,8,0,0,2767,0,0,0,0,14,3,,,4,2931.0,SA,0,167,0,129,1059 +2023,SEGOVIA,66,1,,0,0,0,0,0,0,603,3,0,0,0,5,1,,,2,681.0,SE,0,167,0,129,1059 +2023,VALLADOLID,154,4,,0,1,0,0,0,0,2940,57,0,0,0,30,6,,,3,3195.0,VA,0,167,0,129,1059 
+2023,BURGOS,82,3,,0,0,2,0,0,0,2219,98,0,0,0,20,0,,,4,2428.0,BU,0,167,0,129,1059 +2023,ÁVILA,71,1,,0,0,0,0,1,3,1088,31,0,0,3,6,1,,,3,1208.0,AV,0,167,0,129,1059 +2023,LEÓN,194,3,,1,1,2,0,0,3,3385,22,1,0,19,19,2,,,4,3656.0,LE,0,167,0,129,1059 +2022,VALLADOLID,151,4,,0,1,0,0,0,0,2964,59,0,0,0,30,6,,,3,3218.0,VA,0,155,0,84,1018 +2022,ZAMORA,76,1,,0,0,1,0,0,0,1468,76,0,0,0,8,1,,,3,1634.0,ZA,0,155,0,84,1018 +2022,SORIA,24,1,,0,0,0,0,0,0,396,28,0,0,0,5,1,,,2,457.0,SO,0,155,0,84,1018 +2022,LEÓN,193,3,,1,1,2,0,0,3,3358,22,1,0,17,19,2,,,3,3625.0,LE,0,155,0,84,1018 +2022,ÁVILA,69,1,,0,0,0,0,1,3,1128,33,0,0,3,5,1,,,3,1247.0,AV,0,155,0,84,1018 +2022,SEGOVIA,65,1,,0,0,0,0,0,0,656,3,0,0,0,5,1,,,2,733.0,SE,0,155,0,84,1018 +2022,SALAMANCA,129,2,,0,1,0,8,0,0,2752,0,0,0,0,14,4,,,3,2913.0,SA,0,155,0,84,1018 +2022,BURGOS,80,3,,0,0,2,0,0,0,2219,102,0,0,0,21,0,,,4,2431.0,BU,0,155,0,84,1018 +2022,PALENCIA,95,1,,0,0,0,0,0,4,989,47,0,1,8,13,2,,,4,1164.0,PA,0,155,0,84,1018 +2021,ÁVILA,71,1,,0,0,0,0,1,3,1109,33,0,0,3,5,1,,,3,1230.0,AV,1,108,0,57,946 +2021,SEGOVIA,62,1,,0,0,0,0,0,0,702,3,0,0,0,6,2,,,1,777.0,SE,1,108,0,57,946 +2021,LEÓN,189,3,,1,1,2,0,0,2,3328,22,1,0,17,19,2,,,3,3590.0,LE,1,108,0,57,946 +2021,BURGOS,79,4,,0,0,2,0,0,0,2289,109,0,0,0,21,0,,,4,2508.0,BU,1,108,0,57,946 +2021,ZAMORA,75,1,,0,0,1,0,0,0,1485,79,0,0,0,7,1,,,2,1651.0,ZA,1,108,0,57,946 +2021,SORIA,22,1,,0,0,0,0,0,0,452,31,0,0,0,5,1,,,1,513.0,SO,1,108,0,57,946 +2021,SALAMANCA,129,2,,0,1,0,8,0,0,2742,0,0,0,0,19,4,,,2,2907.0,SA,1,108,0,57,946 +2021,PALENCIA,93,1,,0,0,0,0,0,3,1014,49,0,1,8,13,2,,,4,1188.0,PA,1,108,0,57,946 +2021,VALLADOLID,148,5,,0,1,0,0,0,0,2984,59,0,0,0,31,6,,,3,3237.0,VA,1,108,0,57,946 +2020,PALENCIA,93,1,,0,0,0,0,0,3,1049,52,0,1,8,12,2,,,4,1298.0,PA,0,105,0,76,893 +2020,BURGOS,80,4,,0,0,2,0,0,0,2267,109,0,0,0,21,0,,,3,2559.0,BU,0,105,0,76,893 +2020,LEÓN,196,3,,1,1,2,0,0,2,3324,22,1,0,17,20,2,,,3,3758.0,LE,0,105,0,76,893 +2020,SORIA,21,1,,0,0,0,1,0,0,486,33,0,0,0,5,1,,,1,563.0,SO,0,105,0,76,893 
+2020,SEGOVIA,63,1,,0,0,0,0,0,0,691,3,0,0,0,6,1,,,1,806.0,SE,0,105,0,76,893 +2020,ZAMORA,76,1,,0,0,1,0,0,0,1520,79,0,0,0,7,1,,,1,1746.0,ZA,0,105,0,76,893 +2020,VALLADOLID,148,5,,0,1,0,0,0,0,3012,60,0,0,0,31,6,,,3,3366.0,VA,0,105,0,76,893 +2020,SALAMANCA,129,2,,0,1,0,8,0,0,2733,0,0,0,0,19,4,,,2,2990.0,SA,0,105,0,76,893 +2020,ÁVILA,73,1,,0,0,0,0,1,3,1104,33,0,0,3,6,1,,,3,1280.0,AV,0,105,0,76,893 +2019,SALAMANCA,126,2,,0,1,0,8,0,0,2722,0,0,0,0,18,5,,,0,2975.0,SA,0,328,0,76,865 +2019,ZAMORA,76,1,,0,0,1,0,0,0,1515,79,0,0,0,7,1,,,0,1741.0,ZA,0,328,0,76,865 +2019,PALENCIA,92,2,,0,0,0,0,0,3,1048,52,0,1,8,11,2,,,2,1295.0,PA,0,328,0,76,865 +2019,LEÓN,196,3,,1,1,2,0,0,3,3451,23,1,0,17,17,3,,,1,3886.0,LE,0,328,0,76,865 +2019,SEGOVIA,63,1,,0,0,0,0,0,0,687,3,0,0,0,5,1,,,0,801.0,SE,0,328,0,76,865 +2019,BURGOS,77,4,,0,0,2,0,0,0,2262,109,0,0,0,19,2,,,2,2551.0,BU,0,328,0,76,865 +2019,SORIA,20,1,,0,0,0,1,0,0,489,33,0,0,0,5,1,,,1,565.0,SO,0,328,0,76,865 +2019,ÁVILA,73,1,,0,0,0,0,1,3,1095,33,0,0,3,5,1,,,1,1270.0,AV,0,328,0,76,865 +2019,VALLADOLID,148,5,,0,1,0,0,0,0,3022,60,0,0,0,32,7,,,2,3379.0,VA,0,328,0,76,865 +2018,BURGOS,78,4,,0,0,2,0,0,0,2260,113,0,0,0,16,2,,,,2550.0,BU,1,73,0,76,612 +2018,SALAMANCA,120,2,,0,1,0,8,0,0,2708,0,0,0,0,16,4,,,,2954.0,SA,1,73,0,76,612 +2018,SEGOVIA,63,1,,0,0,0,0,0,0,672,3,0,0,0,4,1,,,,785.0,SE,1,73,0,76,612 +2018,ÁVILA,75,1,,0,0,0,0,1,3,1086,33,0,0,3,5,1,,,,1264.0,AV,1,73,0,76,612 +2018,SORIA,21,1,,0,0,0,1,0,0,479,33,0,0,0,5,1,,,,555.0,SO,1,73,0,76,612 +2018,PALENCIA,90,2,,0,0,0,0,0,3,1036,52,0,1,6,7,1,,,,1273.0,PA,1,73,0,76,612 +2018,LEÓN,197,3,,1,1,2,0,0,3,3494,24,1,0,17,14,3,,,,3929.0,LE,1,73,0,76,612 +2018,ZAMORA,73,1,,0,0,1,0,0,0,1501,79,0,0,0,5,0,,,,1722.0,ZA,1,73,0,76,612 +2018,VALLADOLID,140,5,,0,1,0,0,0,0,3000,60,0,0,0,25,7,,,,3342.0,VA,1,73,0,76,612 +2017,ZAMORA,68,1,,0,0,1,0,0,0,1487,79,0,0,0,5,0,,,,1704.0,ZA,0,74,0,71,613 +2017,VALLADOLID,134,6,,0,1,0,0,0,0,2991,60,0,0,0,22,4,,,,3322.0,VA,0,74,0,71,613 
+2017,SALAMANCA,117,2,,0,1,0,8,0,0,2688,0,0,0,0,11,4,,,,2928.0,SA,0,74,0,71,613 +2017,LEÓN,188,3,,1,1,2,0,0,3,3501,24,1,0,17,12,4,,,,3925.0,LE,0,74,0,71,613 +2017,ÁVILA,72,1,,0,0,0,0,1,2,1076,31,0,0,3,3,2,,,,1248.0,AV,0,74,0,71,613 +2017,BURGOS,79,4,,0,0,2,0,0,0,2244,113,0,0,0,11,1,,,,2530.0,BU,0,74,0,71,613 +2017,PALENCIA,88,2,,0,0,0,0,0,3,1022,52,0,1,4,5,1,,,,1253.0,PA,0,74,0,71,613 +2017,SEGOVIA,63,1,,0,0,0,0,0,0,654,3,0,0,0,3,1,,,,767.0,SE,0,74,0,71,613 +2017,SORIA,20,1,,0,0,0,1,0,0,473,33,0,0,0,4,1,,,,548.0,SO,0,74,0,71,613 +2016,SORIA,19,1,,0,0,0,1,0,0,471,33,0,0,0,3,1,,,,544.0,SO,0,83,0,70,637 +2016,ÁVILA,72,1,,0,0,0,0,1,2,1066,31,0,0,3,3,2,,,,1240.0,AV,0,83,0,70,637 +2016,ZAMORA,68,1,,0,0,1,0,0,0,1479,79,0,0,0,5,0,,,,1697.0,ZA,0,83,0,70,637 +2016,LEÓN,190,3,,1,1,2,0,0,3,3514,24,1,0,17,10,3,,,,3943.0,LE,0,83,0,70,637 +2016,SALAMANCA,119,2,,0,1,0,8,0,0,2675,0,0,0,0,9,3,,,,2916.0,SA,0,83,0,70,637 +2016,PALENCIA,84,2,,0,0,0,0,0,3,1010,52,0,1,3,5,1,,,,1236.0,PA,0,83,0,70,637 +2016,VALLADOLID,128,6,,0,1,0,0,0,0,2979,60,0,0,0,18,4,,,,3296.0,VA,0,83,0,70,637 +2016,BURGOS,76,4,,0,0,2,0,0,0,2233,113,0,0,0,7,1,,,,2511.0,BU,0,83,0,70,637 +2016,SEGOVIA,61,1,,0,0,0,0,0,0,660,3,0,0,0,2,2,,,,774.0,SE,0,83,0,70,637 +2015,SALAMANCA,117,2,,0,1,0,8,0,0,2661,0,0,0,0,7,0,,,,2896.0,SA,,,,,624 +2015,ÁVILA,72,1,,0,0,0,0,1,2,1064,31,0,0,3,2,0,,,,1235.0,AV,,,,,624 +2015,BURGOS,74,4,,0,0,2,0,0,0,2231,113,0,0,0,5,0,,,,2502.0,BU,,,,,624 +2015,ZAMORA,66,1,,0,0,1,0,0,0,1467,79,0,0,0,4,0,,,,1682.0,ZA,,,,,624 +2015,PALENCIA,83,2,,0,0,0,0,0,3,1022,55,0,1,3,3,1,,,,1247.0,PA,,,,,624 +2015,VALLADOLID,123,6,,0,1,0,0,0,0,2971,60,0,0,0,14,1,,,,3278.0,VA,,,,,624 +2015,LEÓN,195,3,,1,1,2,0,0,2,3566,24,1,0,18,9,1,,,,4003.0,LE,,,,,624 +2015,SEGOVIA,60,1,,0,0,0,0,0,0,670,3,0,0,0,1,0,,,,781.0,SE,,,,,624 +2015,SORIA,19,1,,0,0,0,1,0,0,465,33,0,0,0,2,0,,,,536.0,SO,,,,,624 diff --git a/Procesamiento-Datos-2/process.py b/Procesamiento-Datos-2/process.py new file mode 100644 index 0000000..b356b43 --- 
"""Phase 2 processor: clean the PostgreSQL tables and merge them by year.

Run as a script (``python3 process.py``). For every table listed in
``tables`` it normalizes the year column to ``año``, turns placeholder
strings into NaN, converts numeric-looking text columns to numbers and drops
IQR outliers. Two CSV files are written to the working directory:

* ``resumen_estadistico_por_dataset.csv`` - numeric descriptives per table.
* ``dataset_combinado_por_año.csv`` - outer merge on ``año`` of every table
  that has that column.

Third-party requirements: pandas, sqlalchemy, psycopg2-binary, numpy.
"""
import os
from functools import lru_cache
from urllib.parse import quote

import numpy as np
import pandas as pd

# ------------------------------------------------------------
# POSTGRES DATABASE CONFIG
# ------------------------------------------------------------
# Defaults are the original hard-coded values; each one can be overridden
# through an ADICCIONES_DB_* environment variable so credentials do not have
# to be edited into the source.
DB_CONFIG = {
    "host": os.environ.get("ADICCIONES_DB_HOST", "localhost"),
    "port": int(os.environ.get("ADICCIONES_DB_PORT", "5433")),
    "user": os.environ.get("ADICCIONES_DB_USER", "postgres"),
    "password": os.environ.get("ADICCIONES_DB_PASSWORD", "postgres"),
    "database": os.environ.get("ADICCIONES_DB_NAME", "adicciones"),
}

# User and password are percent-encoded: characters such as '@', '/' or ':'
# would otherwise corrupt the URI.
DB_URI = (
    f"postgresql+psycopg2://{quote(DB_CONFIG['user'], safe='')}:"
    f"{quote(DB_CONFIG['password'], safe='')}@"
    f"{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

# Logical dataset name -> PostgreSQL table name.
tables = {
    "juego": "estadisticas_establecimientos_juego",
    "prohibidos": "registro_prohibidos_juego",
    "drogas": "consumo_drogas_alcohol_esp",
    "condenas": "condenas_sexo_localidad",
}

# Column names (compared as whole names) that are renamed to "año".
YEAR_ALIASES = frozenset({"a_o", "ano", "anio"})

# Cell values treated as missing data.
MISSING_TOKENS = ["", " ", "Desconocido", "nan", "NaN"]


# ------------------------------------------------------------
# DATABASE ACCESS
# ------------------------------------------------------------
@lru_cache(maxsize=1)
def get_engine():
    """Create the SQLAlchemy engine for ``DB_URI`` once and reuse it."""
    # Imported here so the cleaning helpers below can be imported and tested
    # without a database driver installed.
    from sqlalchemy import create_engine

    return create_engine(DB_URI)


def __getattr__(name):
    """Keep ``process.engine`` available, created lazily on first access."""
    if name == "engine":
        return get_engine()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def read_table(table_name):
    """Read a whole table from Postgres into a DataFrame.

    Column names are lower-cased and stripped of surrounding whitespace.
    ``table_name`` is interpolated into the SQL text, so it must come from a
    trusted source (here: the ``tables`` mapping), never from user input.
    """
    from sqlalchemy import text

    # A plain connection is enough for a read-only SELECT.
    with get_engine().connect() as conn:
        df = pd.read_sql_query(text(f"SELECT * FROM {table_name};"), conn)
    df.columns = df.columns.str.lower().str.strip()
    print(f"📥 Tabla '{table_name}' cargada: {df.shape[0]} filas, {df.shape[1]} columnas")
    return df


# ------------------------------------------------------------
# CLEANING HELPERS
# ------------------------------------------------------------
def normalize_year_columns(df):
    """Return a copy of ``df`` whose year column is named ``año`` and numeric.

    A column is treated as the year when its whole name (ignoring leading and
    trailing underscores) is one of ``YEAR_ALIASES``. Substring matching is
    deliberately avoided: it would also rename unrelated columns such as
    ``ciudadano``. If several columns end up named ``año`` only the first is
    kept. Values that cannot be parsed as numbers become NaN. The input frame
    is not modified.
    """
    def _target_name(col):
        return "año" if str(col).strip("_") in YEAR_ALIASES else col

    out = df.rename(columns=_target_name)
    # Drop repeated column names (keep the first) and detach from the input
    # so the assignment below never writes into a view of the caller's data.
    out = out.loc[:, ~out.columns.duplicated()].copy()

    if "año" in out.columns:
        out["año"] = pd.to_numeric(out["año"], errors="coerce")
    return out


def coerce_numeric_columns(df):
    """Return a copy of ``df`` with numeric-looking text columns made numeric.

    A text/object column is converted only when every non-missing value
    parses as a number, so genuine text columns are left untouched. Without
    this step, counts that arrive from the database as text are skipped by
    both the outlier filter and the numeric summary. Column names are
    expected to be unique.
    """
    out = df.copy()
    for col in out.columns:
        series = out[col]
        is_text = (pd.api.types.is_object_dtype(series)
                   or pd.api.types.is_string_dtype(series))
        if not is_text:
            continue
        converted = pd.to_numeric(series, errors="coerce")
        # Same missing-value pattern before and after means nothing failed
        # to parse; an all-missing column is left as it is.
        if converted.notna().any() and converted.notna().equals(series.notna()):
            out[col] = converted
    return out


def remove_outliers(df, exclude=()):
    """Drop rows holding an IQR outlier in any numeric column.

    For each numeric column the fences are ``Q1 - 1.5*IQR`` and
    ``Q3 + 1.5*IQR``, computed once on the data as received so the result
    does not depend on column order. Missing values never cause a row to be
    dropped. Columns with no spread (IQR of zero or undefined) are skipped,
    because otherwise every row differing from the dominant value would be
    removed.

    Args:
        df: frame to filter; it is not modified.
        exclude: names of numeric columns that must not be filtered, such as
            identifier or key columns.

    Returns:
        The rows of ``df`` that lie inside the fences of every checked column.
    """
    skipped = set(exclude)
    keep = np.ones(len(df), dtype=bool)
    for col in df.select_dtypes(include=np.number).columns:
        if col in skipped:
            continue
        values = df[col]
        q1, q3 = values.quantile([0.25, 0.75])
        iqr = q3 - q1
        if not iqr > 0:  # zero spread, or NaN when the column is all-missing
            continue
        lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        inside = values.between(lo, hi) | values.isna()
        keep &= inside.to_numpy(dtype=bool)
    return df.loc[keep]


def numeric_summary(df, name):
    """Return ``describe()`` statistics of the numeric columns of ``df``.

    The result has one row per numeric column plus a ``dataset`` column set
    to ``name``. An empty DataFrame is returned (and a warning printed) when
    there is nothing numeric to describe.
    """
    numeric = df.select_dtypes(include=np.number)
    if numeric.empty:
        print(f"⚠️ '{name}' no tiene columnas numéricas.")
        return pd.DataFrame()
    summary = numeric.describe().T
    summary["dataset"] = name
    return summary


def clean_table(df):
    """Apply the full per-table cleaning pipeline and return the result."""
    df = normalize_year_columns(df)
    df = df.replace(MISSING_TOKENS, np.nan)
    df = coerce_numeric_columns(df)
    # "año" is the join key, not a measurement, so it is never filtered.
    return remove_outliers(df, exclude=("año",))


def merge_by_year(frames):
    """Outer-merge the given frames on ``año``.

    Args:
        frames: mapping of dataset name -> DataFrame; every frame must have
            an ``año`` column.

    Returns:
        The merged wide frame, or None when ``frames`` is empty. Columns that
        clash with an earlier frame get the suffix ``_<dataset name>``.
    """
    # NOTE(review): the join key is "año" alone, so two frames that both have
    # several rows per year are combined many-to-many within each year.
    merged = None
    for name, df in frames.items():
        df = df.loc[:, ~df.columns.duplicated()]  # merge needs unique names
        if merged is None:
            merged = df
        else:
            merged = pd.merge(merged, df, on="año", how="outer",
                              suffixes=("", f"_{name}"))
    return merged


# ------------------------------------------------------------
# PIPELINE
# ------------------------------------------------------------
def main():
    """Read, clean, summarize and merge all configured tables."""
    frames = {name: clean_table(read_table(table))
              for name, table in tables.items()}

    # Statistical summary: ignore tables that produced no numeric summary so
    # an all-empty result is reported instead of written out.
    summaries = [numeric_summary(df, name)
                 for name, df in frames.items() if not df.empty]
    summaries = [s for s in summaries if not s.empty]
    if summaries:
        pd.concat(summaries).to_csv("resumen_estadistico_por_dataset.csv")
        print("📊 'resumen_estadistico_por_dataset.csv' generado correctamente.")
    else:
        print("⚠️ No se encontraron columnas numéricas.")

    # Combination by 'año'.
    yearly = {name: df for name, df in frames.items() if "año" in df.columns}
    print("\n📅 Tablas con columna 'año':", list(yearly.keys()))

    if len(yearly) > 1:
        merged = remove_outliers(merge_by_year(yearly), exclude=("año",))
        merged.to_csv("dataset_combinado_por_año.csv", index=False)
        print(f"✅ 'dataset_combinado_por_año.csv' generado correctamente ({merged.shape[0]} filas, {merged.shape[1]} columnas).")
    else:
        print("⚠️ No hay suficientes datasets con columna 'año' para combinar.")


if __name__ == "__main__":
    main()