Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: update projections functionality by department #270

Open
wants to merge 9 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 44 additions & 44 deletions data/census/projection.py
Original file line number Diff line number Diff line change
@@ -1,71 +1,71 @@
import pandas as pd
import numpy as np

import os
import zipfile

from data.spatial.department_names import DEPARTMENTS

"""
This stage loads and cleans projection data about the French population.
"""

def configure(context):
context.config("data_path")
context.config("projection_path", "projection_2021")
context.config("projection_scenario", "00_central")
context.config("projection_path", "projections/donnees_detaillees_departementales.zip")
context.config("projection_scenario", None)
context.config("projection_year", None)

context.stage("data.spatial.departments")

def execute(context):
source_path = "{}/{}/{}.xlsx".format(
df_departments = context.stage("data.spatial.departments")

# Reading data
archive_path = "{}/{}".format(
context.config("data_path"),
context.config("projection_path"),
context.config("projection_scenario"))
context.config("projection_path"))

projection_year = int(context.config("projection_year"))
projection_scenario = context.config("projection_scenario")

df_all = pd.read_excel(
source_path, sheet_name = "population", skiprows = 1).iloc[:107]

df_male = pd.read_excel(
source_path, sheet_name = "populationH", skiprows = 1).iloc[:107]

df_female = pd.read_excel(
source_path, sheet_name = "populationF", skiprows = 1).iloc[:107]

df_male["sex"] = "male"
df_female["sex"] = "female"
with zipfile.ZipFile(archive_path) as archive:
with archive.open("donnees_det_{}.xlsx".format(projection_scenario)) as f:
df = pd.read_excel(f, sheet_name = "Population", skiprows = 5)

assert df_all["Âge au 1er janvier"].iloc[-1] == "Total"
assert df_male["Âge au 1er janvier"].iloc[-1] == "Total des hommes"
assert df_female["Âge au 1er janvier"].iloc[-1] == "Total des femmes"
# Clean sex
df["sex"] = df["SEXE"].replace({ 1: "male", 2: "female" })

df_sex = pd.concat([
df_male.iloc[-1:],
df_female.iloc[-1:]
]).drop(columns = ["Âge au 1er janvier"])[["sex", projection_year]]
df_sex.columns = ["sex", "projection"]

df_age = df_all[["Âge au 1er janvier", projection_year]].iloc[:-1]
df_age.columns = ["age", "projection"]

df_male = df_male[["Âge au 1er janvier", "sex", projection_year]].iloc[:-1]
df_female = df_female[["Âge au 1er janvier", "sex", projection_year]].iloc[:-1]

df_male.columns = ["age", "sex", "projection"]
df_female.columns = ["age","sex", "projection"]
# Clean age range
df["minimum_age"] = df["TRAGE"].apply(lambda x: float(x.split(";")[0][1:]))
df["maximum_age"] = df["TRAGE"].apply(lambda x: np.inf if "+" in x else float(x.split(";")[1][:-1]))

# Clean department
lookup = { name: identifier for identifier, name in DEPARTMENTS.items() }
df["department_id"] = df["ZONE"].replace(lookup)

df_cross = pd.concat([df_male, df_female])
df_cross["sex"] = df_cross["sex"].astype("category")
requested_departments = set(df_departments["departement_id"])
available_departments = set(df["department_id"])

assert len(requested_departments - available_departments) == 0
df = df[df["department_id"].isin(df_departments["departement_id"])]

df_total = df_all.iloc[-1:].drop(columns = ["Âge au 1er janvier"])[[projection_year]]
df_total.columns = ["projection"]
# Clean weight
column = "POP_{}".format(projection_year)
if not column in df:
raise RuntimeError("Year {} is not available in projection data".format(projection_year))

df["weight"] = df[column]

return {
"total": df_total, "sex": df_sex, "age": df_age, "cross": df_cross
}
# Cleanup
df = df[["department_id", "sex", "minimum_age", "maximum_age", "weight"]]
return df

def validate(context):
if context.config("projection_year") is not None:
source_path = "{}/{}/{}.xlsx".format(
if context.config("projection_year") is not None or context.config("projection_scenario") is not None:
source_path = "{}/{}".format(
context.config("data_path"),
context.config("projection_path"),
context.config("projection_scenario"))
context.config("projection_path"))

if not os.path.exists(source_path):
raise RuntimeError("Projection data is not available")
Expand Down
9 changes: 1 addition & 8 deletions data/census/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@ def configure(context):
context.config("census_path", "rp_2019/RP2019_INDCVI_csv.zip")
context.config("census_csv", "FD_INDCVI_2019.csv")

context.config("projection_year", None)

COLUMNS_DTYPES = {
"CANTVILLE":"str",
"NUMMI":"str",
Expand All @@ -39,9 +37,6 @@ def execute(context):

requested_departements = df_codes["departement_id"].unique()

# only pre-filter if we don't need to reweight the census later
prefilter_departments = context.config("projection_year") is None

with context.progress(label = "Reading census ...") as progress:
with zipfile.ZipFile(
"{}/{}".format(context.config("data_path"), context.config("census_path"))) as archive:
Expand All @@ -54,15 +49,13 @@ def execute(context):
for df_chunk in csv:
progress.update(len(df_chunk))

if prefilter_departments:
df_chunk = df_chunk[df_chunk["DEPT"].isin(requested_departements)]
df_chunk = df_chunk[df_chunk["DEPT"].isin(requested_departements)]

if len(df_chunk) > 0:
df_records.append(df_chunk)

return pd.concat(df_records)


def validate(context):
if not os.path.exists("{}/{}".format(context.config("data_path"), context.config("census_path"))):
raise RuntimeError("RP 2019 data is not available")
Expand Down
103 changes: 103 additions & 0 deletions data/spatial/department_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Lookup table mapping a department identifier to the department's name.
#
# Identifiers are strings, not integers: most are two characters and
# zero-padded where needed (e.g. "01"), the two Corse entries use "2A" and
# "2B", and the last five entries use three digits (971, 972, 973, 974, 976).
#
# NOTE(review): data/census/projection.py inverts this table (name ->
# identifier) to translate the names found in the projection data, so the
# spelling here (accents, hyphens, apostrophes) presumably has to match that
# data exactly -- confirm against the source file before editing a name.
DEPARTMENTS = {
"01": "Ain",
"02": "Aisne",
"03": "Allier",
"04": "Alpes-de-Haute-Provence",
"05": "Hautes-Alpes",
"06": "Alpes-Maritimes",
"07": "Ardèche",
"08": "Ardennes",
"09": "Ariège",
"10": "Aube",
"11": "Aude",
"12": "Aveyron",
"13": "Bouches-du-Rhône",
"14": "Calvados",
"15": "Cantal",
"16": "Charente",
"17": "Charente-Maritime",
"18": "Cher",
"19": "Corrèze",
"2A": "Corse-du-Sud",
"2B": "Haute-Corse",
"21": "Côte-d'Or",
"22": "Côtes-d'Armor",
"23": "Creuse",
"24": "Dordogne",
"25": "Doubs",
"26": "Drôme",
"27": "Eure",
"28": "Eure-et-Loir",
"29": "Finistère",
"30": "Gard",
"31": "Haute-Garonne",
"32": "Gers",
"33": "Gironde",
"34": "Hérault",
"35": "Ille-et-Vilaine",
"36": "Indre",
"37": "Indre-et-Loire",
"38": "Isère",
"39": "Jura",
"40": "Landes",
"41": "Loir-et-Cher",
"42": "Loire",
"43": "Haute-Loire",
"44": "Loire-Atlantique",
"45": "Loiret",
"46": "Lot",
"47": "Lot-et-Garonne",
"48": "Lozère",
"49": "Maine-et-Loire",
"50": "Manche",
"51": "Marne",
"52": "Haute-Marne",
"53": "Mayenne",
"54": "Meurthe-et-Moselle",
"55": "Meuse",
"56": "Morbihan",
"57": "Moselle",
"58": "Nièvre",
"59": "Nord",
"60": "Oise",
"61": "Orne",
"62": "Pas-de-Calais",
"63": "Puy-de-Dôme",
"64": "Pyrénées-Atlantiques",
"65": "Hautes-Pyrénées",
"66": "Pyrénées-Orientales",
"67": "Bas-Rhin",
"68": "Haut-Rhin",
"69": "Rhône",
"70": "Haute-Saône",
"71": "Saône-et-Loire",
"72": "Sarthe",
"73": "Savoie",
"74": "Haute-Savoie",
"75": "Paris",
"76": "Seine-Maritime",
"77": "Seine-et-Marne",
"78": "Yvelines",
"79": "Deux-Sèvres",
"80": "Somme",
"81": "Tarn",
"82": "Tarn-et-Garonne",
"83": "Var",
"84": "Vaucluse",
"85": "Vendée",
"86": "Vienne",
"87": "Haute-Vienne",
"88": "Vosges",
"89": "Yonne",
"90": "Territoire de Belfort",
"91": "Essonne",
"92": "Hauts-de-Seine",
"93": "Seine-Saint-Denis",
"94": "Val-de-Marne",
"95": "Val-d'Oise",
"971": "Guadeloupe",
"972": "Martinique",
"973": "Guyane",
"974": "La Réunion",
"976": "Mayotte"
}
15 changes: 5 additions & 10 deletions docs/population.md
Original file line number Diff line number Diff line change
Expand Up @@ -307,24 +307,19 @@ Running the pipeline again will add the `mode` column to the `trips.csv` file and

The pipeline can make use of population projections from INSEE up to 2070. The same methodology can also be used to scale down the population. The process takes into account the marginal distributions of sex, age, their combination, and the total number of persons. The census data for the base year (see above) is reweighted according to those marginals using *Iterative Proportional Updating*.

- To make use of the scaling, [download the projection data from INSEE](https://www.insee.fr/fr/statistiques/5894093?sommaire=5760764). There are various scenarios in Excel format that you can choose from. The default is the *Scénario centrale*, the central scenario.
- Put the downloaded file into `data/projection_2021`, so you will have the file `data/projection_2021/00_central.xlsx`
- To make use of the scaling, [download the projection data from INSEE](https://www.insee.fr/fr/statistiques/7747107?sommaire=6652140). Download *Les tableaux en Excel*, an archive that contains all projection scenarios in Excel format, so there are various scenarios that you can choose from. The default is the *Scénario central*, the central scenario.
- Put the downloaded file into `data/projections`, so you will have the file `data/projections/donnees_detaillees_departementales.zip`

Then, activate the projection procedure by defining the projection year in the configuration:
Then, activate the projection procedure by defining the projection scenario and year in the configuration:

```yaml
config:
# [...]
projection_scenario: Central
projection_year: 2030
```

You may choose any year (past or future) that is contained in the projection scenario Excel file. In case you want to use a different scenario, download the corresponding file, put it into the folder mentioned above, and choose the scenario name via configuration:

```yaml
config:
# [...]
projection_scenario: 00_central
```
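You may choose any year (past or future) that is contained in the Excel files (sheet *Population*) in the downloaded archive. Likewise, you may choose any projection scenario contained in the archive: the scenario name is the part of the Excel file name that follows `donnees_det_` (for instance, `Central` selects `donnees_det_Central.xlsx`). The scenarios are documented in the Excel files' *Documentation* sheet.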

### Urban type

Expand Down
Loading
Loading