eqasim-org · sebhoerl · Oct 23, 2024 · Oct 23, 2024 · Oct 23, 2024 · Oct 23, 2024
diff --git a/data/census/projection.py b/data/census/projection.py
@@ -1,71 +1,71 @@
 import pandas as pd
+import numpy as np
+
 import os
+import zipfile
+
+from data.spatial.department_names import DEPARTMENTS
 
 """
 This stage loads and cleans projection data about the French population.
 """
 
 def configure(context):
     context.config("data_path")
-    context.config("projection_path", "projection_2021")
-    context.config("projection_scenario", "00_central")
+    context.config("projection_path", "projections/donnees_detaillees_departementales.zip")
+    context.config("projection_scenario", None)
     context.config("projection_year", None)
 
+    context.stage("data.spatial.departments")
+
 def execute(context):
-    source_path = "{}/{}/{}.xlsx".format(
+    df_departments = context.stage("data.spatial.departments")
+
+    # Reading data
+    archive_path = "{}/{}".format(
         context.config("data_path"), 
-        context.config("projection_path"), 
-        context.config("projection_scenario"))
+        context.config("projection_path"))
 
     projection_year = int(context.config("projection_year"))
+    projection_scenario = context.config("projection_scenario")
 
-    df_all = pd.read_excel(
-        source_path, sheet_name = "population", skiprows = 1).iloc[:107]
-
-    df_male = pd.read_excel(
-        source_path, sheet_name = "populationH", skiprows = 1).iloc[:107]
-
-    df_female = pd.read_excel(
-        source_path, sheet_name = "populationF", skiprows = 1).iloc[:107]
-
-    df_male["sex"] = "male"
-    df_female["sex"] = "female"
+    with zipfile.ZipFile(archive_path) as archive:
+        with archive.open("donnees_det_{}.xlsx".format(projection_scenario)) as f:
+            df = pd.read_excel(f, sheet_name = "Population", skiprows = 5)
 
-    assert df_all["Âge au 1er janvier"].iloc[-1] == "Total"
-    assert df_male["Âge au 1er janvier"].iloc[-1] == "Total des hommes"
-    assert df_female["Âge au 1er janvier"].iloc[-1] == "Total des femmes"
+    # Clean sex
+    df["sex"] = df["SEXE"].replace({ 1: "male", 2: "female" })
 
-    df_sex = pd.concat([
-        df_male.iloc[-1:],
-        df_female.iloc[-1:]
-    ]).drop(columns = ["Âge au 1er janvier"])[["sex", projection_year]]
-    df_sex.columns = ["sex", "projection"]
-
-    df_age = df_all[["Âge au 1er janvier", projection_year]].iloc[:-1]
-    df_age.columns = ["age", "projection"]
-
-    df_male = df_male[["Âge au 1er janvier", "sex", projection_year]].iloc[:-1]
-    df_female = df_female[["Âge au 1er janvier", "sex", projection_year]].iloc[:-1]
-
-    df_male.columns = ["age", "sex", "projection"]
-    df_female.columns = ["age","sex", "projection"]
+    # Clean age range
+    df["minimum_age"] = df["TRAGE"].apply(lambda x: float(x.split(";")[0][1:]))
+    df["maximum_age"] = df["TRAGE"].apply(lambda x: np.inf if "+" in x else float(x.split(";")[1][:-1]))
+
+    # Clean department
+    lookup = { name: identifier for identifier, name in DEPARTMENTS.items() }
+    df["department_id"] = df["ZONE"].replace(lookup)
 
-    df_cross = pd.concat([df_male, df_female])
-    df_cross["sex"] = df_cross["sex"].astype("category")
+    requested_departments = set(df_departments["departement_id"])
+    available_departments = set(df["department_id"])
+
+    assert len(requested_departments - available_departments) == 0
+    df = df[df["department_id"].isin(df_departments["departement_id"])]
 
-    df_total = df_all.iloc[-1:].drop(columns = ["Âge au 1er janvier"])[[projection_year]]
-    df_total.columns = ["projection"]
+    # Clean weight
+    column = "POP_{}".format(projection_year)
+    if not column in df:
+        raise RuntimeError("Year {} is not available in projection data".format(projection_year))
+
+    df["weight"] = df[column]
 
-    return {
-        "total": df_total, "sex": df_sex, "age": df_age, "cross": df_cross
-    }
+    # Cleanup
+    df = df[["department_id", "sex", "minimum_age", "maximum_age", "weight"]] 
+    return df
 
 def validate(context):
-    if context.config("projection_year") is not None:
-        source_path = "{}/{}/{}.xlsx".format(
+    if context.config("projection_year") is not None or context.config("projection_scenario") is not None:
+        source_path = "{}/{}".format(
             context.config("data_path"), 
-            context.config("projection_path"), 
-            context.config("projection_scenario"))
+            context.config("projection_path"))
 
         if not os.path.exists(source_path):
             raise RuntimeError("Projection data is not available")

diff --git a/data/census/raw.py b/data/census/raw.py
@@ -13,8 +13,6 @@ def configure(context):
     context.config("census_path", "rp_2019/RP2019_INDCVI_csv.zip")
     context.config("census_csv", "FD_INDCVI_2019.csv")
 
-    context.config("projection_year", None)
-
 COLUMNS_DTYPES = {
     "CANTVILLE":"str", 
     "NUMMI":"str", 
@@ -39,9 +37,6 @@ def execute(context):
 
     requested_departements = df_codes["departement_id"].unique()
 
-    # only pre-filter if we don't need to reweight the census later
-    prefilter_departments = context.config("projection_year") is None
-
     with context.progress(label = "Reading census ...") as progress:
         with zipfile.ZipFile(
             "{}/{}".format(context.config("data_path"), context.config("census_path"))) as archive:
@@ -54,15 +49,13 @@ def execute(context):
                 for df_chunk in csv:
                     progress.update(len(df_chunk))
 
-                    if prefilter_departments:
-                        df_chunk = df_chunk[df_chunk["DEPT"].isin(requested_departements)]
+                    df_chunk = df_chunk[df_chunk["DEPT"].isin(requested_departements)]
 
                     if len(df_chunk) > 0:
                         df_records.append(df_chunk)
 
     return pd.concat(df_records)
 
-
 def validate(context):
     if not os.path.exists("{}/{}".format(context.config("data_path"), context.config("census_path"))):
         raise RuntimeError("RP 2019 data is not available")

diff --git a/data/spatial/department_names.py b/data/spatial/department_names.py
@@ -0,0 +1,103 @@
+DEPARTMENTS = {
+    "01": "Ain",
+    "02": "Aisne",
+    "03": "Allier",
+    "04": "Alpes-de-Haute-Provence",
+    "05": "Hautes-Alpes",
+    "06": "Alpes-Maritimes",
+    "07": "Ardèche",
+    "08": "Ardennes",
+    "09": "Ariège",
+    "10": "Aube",
+    "11": "Aude",
+    "12": "Aveyron",
+    "13": "Bouches-du-Rhône",
+    "14": "Calvados",
+    "15": "Cantal",
+    "16": "Charente",
+    "17": "Charente-Maritime",
+    "18": "Cher",
+    "19": "Corrèze",
+    "2A": "Corse-du-Sud",
+    "2B": "Haute-Corse",
+    "21": "Côte-d'Or",
+    "22": "Côtes-d'Armor",
+    "23": "Creuse",
+    "24": "Dordogne",
+    "25": "Doubs",
+    "26": "Drôme",
+    "27": "Eure",
+    "28": "Eure-et-Loir",
+    "29": "Finistère",
+    "30": "Gard",
+    "31": "Haute-Garonne",
+    "32": "Gers",
+    "33": "Gironde",
+    "34": "Hérault",
+    "35": "Ille-et-Vilaine",
+    "36": "Indre",
+    "37": "Indre-et-Loire",
+    "38": "Isère",
+    "39": "Jura",
+    "40": "Landes",
+    "41": "Loir-et-Cher",
+    "42": "Loire",
+    "43": "Haute-Loire",
+    "44": "Loire-Atlantique",
+    "45": "Loiret",
+    "46": "Lot",
+    "47": "Lot-et-Garonne",
+    "48": "Lozère",
+    "49": "Maine-et-Loire",
+    "50": "Manche",
+    "51": "Marne",
+    "52": "Haute-Marne",
+    "53": "Mayenne",
+    "54": "Meurthe-et-Moselle",
+    "55": "Meuse",
+    "56": "Morbihan",
+    "57": "Moselle",
+    "58": "Nièvre",
+    "59": "Nord",
+    "60": "Oise",
+    "61": "Orne",
+    "62": "Pas-de-Calais",
+    "63": "Puy-de-Dôme",
+    "64": "Pyrénées-Atlantiques",
+    "65": "Hautes-Pyrénées",
+    "66": "Pyrénées-Orientales",
+    "67": "Bas-Rhin",
+    "68": "Haut-Rhin",
+    "69": "Rhône",
+    "70": "Haute-Saône",
+    "71": "Saône-et-Loire",
+    "72": "Sarthe",
+    "73": "Savoie",
+    "74": "Haute-Savoie",
+    "75": "Paris",
+    "76": "Seine-Maritime",
+    "77": "Seine-et-Marne",
+    "78": "Yvelines",
+    "79": "Deux-Sèvres",
+    "80": "Somme",
+    "81": "Tarn",
+    "82": "Tarn-et-Garonne",
+    "83": "Var",
+    "84": "Vaucluse",
+    "85": "Vendée",
+    "86": "Vienne",
+    "87": "Haute-Vienne",
+    "88": "Vosges",
+    "89": "Yonne",
+    "90": "Territoire de Belfort",
+    "91": "Essonne",
+    "92": "Hauts-de-Seine",
+    "93": "Seine-Saint-Denis",
+    "94": "Val-de-Marne",
+    "95": "Val-d'Oise",
+    "971": "Guadeloupe",
+    "972": "Martinique",
+    "973": "Guyane",
+    "974": "La Réunion",
+    "976": "Mayotte"
+}
diff --git a/docs/population.md b/docs/population.md
@@ -307,24 +307,19 @@ Running the pipeline again will add the `mode` colum to the `trips.csv` file and
 
 The pipeline allows to make use of population projections from INSEE up to 2070. The same methodology can also be used to scale down the population. The process takes into account the marginal distribution of sex, age, their combination, and the total number of persons. The census data for the base year (see above) is reweighted according to those marginals using *Iterative Proportional Updating*.
 
-- To make use of the scaling, [download the projection data from INSEE](https://www.insee.fr/fr/statistiques/5894093?sommaire=5760764). There are various scenarios in Excel format that you can choose from. The default is the *Scénario centrale*, the central scenario. 
-- Put the downloaded file into `data/projection_2021`, so you will have the file `data/projection_2021/00_central.xlsx`
+- To make use of the scaling, [download the projection data from INSEE](https://www.insee.fr/fr/statistiques/7747107?sommaire=6652140). Download *Les tableaux en Excel*, an archive that contains all projection scenarios in Excel format. There are various scenarios that you can choose from, such as the *Scénario central*, the central scenario. No scenario is selected by default; set it in the configuration as shown below.
+- Put the downloaded file into `data/projections`, so you will have the file `data/projections/donnees_detaillees_departementales.zip`
 
-Then, activate the projection procedure by defining the projection year in the configuration:
+Then, activate the projection procedure by defining the projection scenario and year in the configuration:
 
 ```yaml
 config: 
   # [...]
+  projection_scenario: Central
   projection_year: 2030
 ```
 
-You may choose any year (past or future) that is contained in the projection scenario Excel file. In case you want to use a different scenario, download the corresponding file, put it into the folder mentioned above, and choose the scenario name via configuration:
-
-```yaml
-config: 
-  # [...]
-  projection_scenario: 00_central
-```
+You may choose any year (past or future) that is contained in the Excel files (sheet *Population*) in the downloaded archive. The same is true for the projection scenarios: the scenario name is taken from the file names in the archive (for example, `Central` refers to the file `donnees_det_Central.xlsx`), and the scenarios are documented in the Excel files' *Documentation* sheet.
 
 ### Urban type