TP4 : Régression linéaire multiple, auto-corrélation spatiale et GWR¶
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
Données¶
Chargement des contours des pays¶
countries = gpd.read_file("countries.geojson")
countries.plot()
<Axes: >
countries
| name | ISO3166-1-Alpha-3 | ISO3166-1-Alpha-2 | geometry | |
|---|---|---|---|---|
| 0 | Indonesia | IDN | ID | MULTIPOLYGON (((117.70361 4.16342, 117.70361 4... |
| 1 | Malaysia | MYS | MY | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... |
| 2 | Chile | CHL | CL | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... |
| 3 | Bolivia | BOL | BO | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... |
| 4 | Peru | PER | PE | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... |
| ... | ... | ... | ... | ... |
| 253 | Macao S.A.R | MAC | MO | MULTIPOLYGON (((113.5586 22.16303, 113.56943 2... |
| 254 | Ashmore and Cartier Islands | -99 | -99 | POLYGON ((123.59702 -12.42832, 123.59775 -12.4... |
| 255 | Bajo Nuevo Bank (Petrel Is.) | -99 | -99 | POLYGON ((-79.98929 15.79495, -79.98782 15.796... |
| 256 | Serranilla Bank | -99 | -99 | POLYGON ((-78.63707 15.86209, -78.64041 15.864... |
| 257 | Scarborough Reef | -99 | -99 | POLYGON ((117.75389 15.15437, 117.75569 15.151... |
258 rows × 4 columns
countries = countries.set_index("name")
countries = countries.rename(index={
"United States of America": "United States",
"Democratic Republic of the Congo": "Democratic Republic of Congo",
"Ivory Coast": "Cote d\'Ivoire",
"Republic of the Congo": "Congo",
"Czech Republic": "Czechia",
"The Bahamas": "Bahamas",
"Guinea Bissau": "Guinea-Bissau",
"Federated States of Micronesia": "Micronesia (country)",
"Macedonia": "North Macedonia",
"eSwatini": "Eswatini",
"Republic of Serbia": "Serbia",
"United Republic of Tanzania": "Tanzania",
"São Tomé and Principe": "Sao Tome and Principe",
"Cape Verde": "Cabo Verde"
}
)
Chargement des données¶
Cas 1: On a téléchargé des données pour une seule date¶
life_expectancy = pd.read_csv("life-expectancy.csv")
life_expectancy.head()
| Entity | Code | Year | Period life expectancy at birth | time | |
|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | 2023 | 66.0346 | 2023 |
| 1 | Albania | ALB | 2023 | 79.6019 | 2023 |
| 2 | Algeria | DZA | 2023 | 76.2610 | 2023 |
| 3 | Andorra | AND | 2023 | 84.0406 | 2023 |
| 4 | Angola | AGO | 2023 | 64.6170 | 2023 |
life_expectancy = life_expectancy.set_index("Entity")
life_expectancy.head()
| Code | Year | Period life expectancy at birth | time | |
|---|---|---|---|---|
| Entity | ||||
| Afghanistan | AFG | 2023 | 66.0346 | 2023 |
| Albania | ALB | 2023 | 79.6019 | 2023 |
| Algeria | DZA | 2023 | 76.2610 | 2023 |
| Andorra | AND | 2023 | 84.0406 | 2023 |
| Angola | AGO | 2023 | 64.6170 | 2023 |
countries["life_expectancy"]=np.arange(len(countries))
countries.head()
| ISO3166-1-Alpha-3 | ISO3166-1-Alpha-2 | geometry | life_expectancy | |
|---|---|---|---|---|
| name | ||||
| Indonesia | IDN | ID | MULTIPOLYGON (((117.70361 4.16342, 117.70361 4... | 0 |
| Malaysia | MYS | MY | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... | 1 |
| Chile | CHL | CL | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... | 2 |
| Bolivia | BOL | BO | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... | 3 |
| Peru | PER | PE | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... | 4 |
life_expectancy["Period life expectancy at birth"]
| Period life expectancy at birth | |
|---|---|
| Entity | |
| Afghanistan | 66.0346 |
| Albania | 79.6019 |
| Algeria | 76.2610 |
| Andorra | 84.0406 |
| Angola | 64.6170 |
| ... | ... |
| Vietnam | 74.5883 |
| Western Sahara | 71.3850 |
| Yemen | 69.2952 |
| Zambia | 66.3487 |
| Zimbabwe | 62.7748 |
201 rows × 1 columns
countries["life_expectancy"]=life_expectancy["Period life expectancy at birth"]
countries.head()
| ISO3166-1-Alpha-3 | ISO3166-1-Alpha-2 | geometry | life_expectancy | |
|---|---|---|---|---|
| name | ||||
| Indonesia | IDN | ID | MULTIPOLYGON (((117.70361 4.16342, 117.70361 4... | 71.1457 |
| Malaysia | MYS | MY | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... | 76.6571 |
| Chile | CHL | CL | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... | 81.1667 |
| Bolivia | BOL | BO | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... | 68.5814 |
| Peru | PER | PE | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... | 77.7401 |
countries.plot(column="life_expectancy", figsize=(12,6), legend=True).set_title("life expectancy")
Text(0.5, 1.0, 'life expectancy')
On fait la même chose pour les autres variables étudiées
countries["population"] = pd.read_csv("population.csv").set_index("Entity")["Population (historical)"]
countries.plot(column="population", figsize=(12,6), legend=True).set_title("population")
Text(0.5, 1.0, 'population')
countries["log_population"] = np.log(countries["population"])
countries["co2_emissions"] = pd.read_csv("co-emissions-per-capita.csv").set_index("Entity")["Annual CO₂ emissions (per capita)"]
countries.plot(column="co2_emissions", figsize=(12,6), legend=True).set_title("Emissions annuelles de CO₂ par habitant")
Text(0.5, 1.0, 'Emissions annuelles de CO₂ par habitant')
countries["child_mortality"] = pd.read_csv("child-mortality.csv").set_index("Entity")["Child mortality rate"]
countries.plot(column="child_mortality", figsize=(12,6), legend=True).set_title("Taux de mortalité infantile")
Text(0.5, 1.0, 'Taux de mortalité infantile')
countries["log_child_mortality"] = np.log(1+countries["child_mortality"])
Cas 2 : on a téléchargé la série temporelle¶
hdi = pd.read_csv("human-development-index.csv")
hdi.head()
| Entity | Code | Year | Human Development Index | World regions according to OWID | |
|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | 1990 | 0.285 | NaN |
| 1 | Afghanistan | AFG | 1991 | 0.291 | NaN |
| 2 | Afghanistan | AFG | 1992 | 0.301 | NaN |
| 3 | Afghanistan | AFG | 1993 | 0.311 | NaN |
| 4 | Afghanistan | AFG | 1994 | 0.305 | NaN |
hdi_by_year = hdi.pivot_table(index='Entity', columns='Year', values='Human Development Index')
hdi_by_year.head()
| Year | 1990 | 1991 | 1992 | 1993 | 1994 | 1995 | 1996 | 1997 | 1998 | 1999 | ... | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | 2023 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Entity | |||||||||||||||||||||
| Afghanistan | 0.285 | 0.291 | 0.301 | 0.311 | 0.305 | 0.329 | 0.334 | 0.338 | 0.338 | 0.347 | ... | 0.497000 | 0.49600 | 0.495000 | 0.4960 | 0.498000 | 0.507000 | 0.501000 | 0.486000 | 0.495000 | 0.496000 |
| Africa | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.536723 | 0.54259 | 0.546639 | 0.5504 | 0.555833 | 0.560354 | 0.559465 | 0.561104 | 0.571485 | 0.576059 |
| Albania | 0.654 | 0.638 | 0.622 | 0.624 | 0.629 | 0.638 | 0.647 | 0.645 | 0.659 | 0.671 | ... | 0.797000 | 0.79700 | 0.797000 | 0.7980 | 0.801000 | 0.805000 | 0.794000 | 0.794000 | 0.806000 | 0.810000 |
| Algeria | 0.595 | 0.596 | 0.601 | 0.603 | 0.603 | 0.608 | 0.615 | 0.624 | 0.634 | 0.643 | ... | 0.732000 | 0.73700 | 0.743000 | 0.7460 | 0.749000 | 0.753000 | 0.742000 | 0.755000 | 0.761000 | 0.763000 |
| Andorra | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.866000 | 0.86900 | 0.872000 | 0.8730 | 0.875000 | 0.876000 | 0.851000 | 0.871000 | 0.893000 | 0.913000 |
5 rows × 34 columns
def function_to_rename_columns_name(column_name):
return f"hdi_{column_name}"
hdi_by_year = hdi_by_year.rename(columns=function_to_rename_columns_name)
hdi_by_year.head()
| Year | hdi_1990 | hdi_1991 | hdi_1992 | hdi_1993 | hdi_1994 | hdi_1995 | hdi_1996 | hdi_1997 | hdi_1998 | hdi_1999 | ... | hdi_2014 | hdi_2015 | hdi_2016 | hdi_2017 | hdi_2018 | hdi_2019 | hdi_2020 | hdi_2021 | hdi_2022 | hdi_2023 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Entity | |||||||||||||||||||||
| Afghanistan | 0.285 | 0.291 | 0.301 | 0.311 | 0.305 | 0.329 | 0.334 | 0.338 | 0.338 | 0.347 | ... | 0.497000 | 0.49600 | 0.495000 | 0.4960 | 0.498000 | 0.507000 | 0.501000 | 0.486000 | 0.495000 | 0.496000 |
| Africa | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.536723 | 0.54259 | 0.546639 | 0.5504 | 0.555833 | 0.560354 | 0.559465 | 0.561104 | 0.571485 | 0.576059 |
| Albania | 0.654 | 0.638 | 0.622 | 0.624 | 0.629 | 0.638 | 0.647 | 0.645 | 0.659 | 0.671 | ... | 0.797000 | 0.79700 | 0.797000 | 0.7980 | 0.801000 | 0.805000 | 0.794000 | 0.794000 | 0.806000 | 0.810000 |
| Algeria | 0.595 | 0.596 | 0.601 | 0.603 | 0.603 | 0.608 | 0.615 | 0.624 | 0.634 | 0.643 | ... | 0.732000 | 0.73700 | 0.743000 | 0.7460 | 0.749000 | 0.753000 | 0.742000 | 0.755000 | 0.761000 | 0.763000 |
| Andorra | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.866000 | 0.86900 | 0.872000 | 0.8730 | 0.875000 | 0.876000 | 0.851000 | 0.871000 | 0.893000 | 0.913000 |
5 rows × 34 columns
countries[["hdi_2023", "hdi_2013"]]=hdi_by_year[["hdi_2023", "hdi_2013"]]
countries
| ISO3166-1-Alpha-3 | ISO3166-1-Alpha-2 | geometry | life_expectancy | population | log_population | co2_emissions | child_mortality | log_child_mortality | hdi_2023 | hdi_2013 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| name | |||||||||||
| Indonesia | IDN | ID | MULTIPOLYGON (((117.70361 4.16342, 117.70361 4... | 71.1457 | 281190068.0 | 19.454541 | 2.711182 | 2.06 | 1.118415 | 0.728 | 0.690 |
| Malaysia | MYS | MY | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... | 76.6571 | 35126295.0 | 17.374461 | 7.865810 | 0.81 | 0.593327 | 0.819 | 0.791 |
| Chile | CHL | CL | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... | 81.1667 | 19658833.0 | 16.794037 | 3.947550 | 0.72 | 0.542324 | 0.878 | 0.845 |
| Bolivia | BOL | BO | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... | 68.5814 | 12244161.0 | 16.320560 | 2.010797 | 2.31 | 1.196948 | 0.733 | 0.699 |
| Peru | PER | PE | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... | 77.7401 | 33845616.0 | 17.337320 | 1.951944 | 1.58 | 0.947789 | 0.794 | 0.757 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Macao S.A.R | MAC | MO | MULTIPOLYGON (((113.5586 22.16303, 113.56943 2... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Ashmore and Cartier Islands | -99 | -99 | POLYGON ((123.59702 -12.42832, 123.59775 -12.4... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Bajo Nuevo Bank (Petrel Is.) | -99 | -99 | POLYGON ((-79.98929 15.79495, -79.98782 15.796... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Serranilla Bank | -99 | -99 | POLYGON ((-78.63707 15.86209, -78.64041 15.864... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Scarborough Reef | -99 | -99 | POLYGON ((117.75389 15.15437, 117.75569 15.151... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
258 rows × 11 columns
countries.plot(column='hdi_2023', figsize=(12,6), legend=True).set_title("Human Development Index (2023)")
Text(0.5, 1.0, 'Human Development Index (2023)')
Gestion des valeurs NaN¶
La plupart des méthodes d'analyse statistiques ne supportent pas les NaN, par exemple l'ACP.
Si l'analyse de ces unités spatiales est importante, on peut leur attribuer une valeur, par exemple la moyenne pour chaque variable. Si les variables avec un NaN sont corrélées avec une variable pour laquelle la valeur est connue, on peut inférer avec une régression linéaire.
Dans notre cas, on va simplement ignorer les individus avec des NaN, avec la méthode dropna().
countries = countries.dropna()
countries
| ISO3166-1-Alpha-3 | ISO3166-1-Alpha-2 | geometry | life_expectancy | population | log_population | co2_emissions | child_mortality | log_child_mortality | hdi_2023 | hdi_2013 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| name | |||||||||||
| Indonesia | IDN | ID | MULTIPOLYGON (((117.70361 4.16342, 117.70361 4... | 71.1457 | 281190068.0 | 19.454541 | 2.711182 | 2.06 | 1.118415 | 0.728 | 0.690 |
| Malaysia | MYS | MY | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... | 76.6571 | 35126295.0 | 17.374461 | 7.865810 | 0.81 | 0.593327 | 0.819 | 0.791 |
| Chile | CHL | CL | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... | 81.1667 | 19658833.0 | 16.794037 | 3.947550 | 0.72 | 0.542324 | 0.878 | 0.845 |
| Bolivia | BOL | BO | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... | 68.5814 | 12244161.0 | 16.320560 | 2.010797 | 2.31 | 1.196948 | 0.733 | 0.699 |
| Peru | PER | PE | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... | 77.7401 | 33845616.0 | 17.337320 | 1.951944 | 1.58 | 0.947789 | 0.794 | 0.757 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Nauru | NRU | NR | POLYGON ((166.93881 -0.49041, 166.95558 -0.497... | 62.1094 | 11900.0 | 9.384294 | 4.933193 | 0.89 | 0.636577 | 0.703 | 0.640 |
| Micronesia (country) | FSM | FM | MULTIPOLYGON (((163.02605 5.34089, 163.03045 5... | 67.1983 | 112646.0 | 11.632005 | 1.278669 | 2.31 | 1.196948 | 0.615 | 0.603 |
| Vanuatu | VUT | VU | MULTIPOLYGON (((169.84034 -20.1408, 169.86052 ... | 71.4769 | 320422.0 | 12.677394 | 0.588156 | 1.68 | 0.985817 | 0.621 | 0.592 |
| Palau | PLW | PW | MULTIPOLYGON (((134.2715 7.07453, 134.27931 7.... | 69.2690 | 17751.0 | 9.784197 | 12.180102 | 2.23 | 1.172482 | 0.786 | 0.791 |
| Bahrain | BHR | BH | MULTIPOLYGON (((50.55161 26.19424, 50.59474 26... | 81.2835 | 1569674.0 | 14.266379 | 24.710030 | 0.86 | 0.620576 | 0.899 | 0.846 |
188 rows × 11 columns
pd.plotting.scatter_matrix(countries[["population", "log_population", "life_expectancy", "child_mortality", "log_child_mortality", "hdi_2023", "co2_emissions"]], figsize=(15, 15))
Text(0.5, 1.0, 'Scatter Matrix')
2) Régression linéaire multiple¶
On va essayer de prédire les emissions de CO2 du pays en fonctions des autres variables
import statsmodels.api as sm
explicative_vars = ["population", "life_expectancy", "log_child_mortality", "hdi_2023"]
target = "co2_emissions"
X = sm.add_constant(countries[explicative_vars])
y = countries[target]
#Ordinary Least Square
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: co2_emissions R-squared: 0.303
Model: OLS Adj. R-squared: 0.288
Method: Least Squares F-statistic: 19.86
Date: Thu, 05 Mar 2026 Prob (F-statistic): 1.34e-13
Time: 13:23:47 Log-Likelihood: -554.61
No. Observations: 188 AIC: 1119.
Df Residuals: 183 BIC: 1135.
Df Model: 4
Covariance Type: nonrobust
=======================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------
const -23.6306 10.692 -2.210 0.028 -44.726 -2.535
population 5.968e-10 2.25e-09 0.265 0.791 -3.84e-09 5.03e-09
life_expectancy 0.0883 0.136 0.647 0.518 -0.181 0.357
log_child_mortality 2.4131 1.623 1.487 0.139 -0.789 5.615
hdi_2023 25.6472 6.123 4.188 0.000 13.566 37.729
==============================================================================
Omnibus: 171.228 Durbin-Watson: 1.660
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2597.328
Skew: 3.515 Prob(JB): 0.00
Kurtosis: 19.797 Cond. No. 5.02e+09
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.02e+09. This might indicate that there are
strong multicollinearity or other numerical problems.
Ici, on voit que l'indicateur P>|t| n'est pas petit (est supérieur à 0.05) pour plusieurs variables : log_population, life_expectancy, log_child_mortality. L'intervalle de confiance a 95% pour ces variables contient à la fois des valeurs positives et négatives donc le sens de la relation est incertain. On en conclut qu'il ne faut pas utiliser ces variables.
y_pred = results.predict(X)
countries["predicted_co2_emissions"] = y_pred
countries["residus"] = results.resid
/usr/local/lib/python3.12/dist-packages/geopandas/geodataframe.py:1969: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy super().__setitem__(key, value)
#calcul des valeurs min et max pour avoir une échelle commune
vmin = min(countries["co2_emissions"].min(), countries["predicted_co2_emissions"].min())
vmax = max(countries["co2_emissions"].max(), countries["predicted_co2_emissions"].max())
countries.plot(column="co2_emissions", figsize=(12,6), legend=True, vmin=vmin, vmax=vmax).set_title("Emissions annuelles de CO₂ par habitant")
countries.plot(column="predicted_co2_emissions", figsize=(12,6), legend=True, vmin=vmin, vmax=vmax).set_title("Emissions annuelles de CO₂ par habitant prédites")
countries.plot(column="residus", figsize=(12,6), legend=True, cmap="coolwarm").set_title("Résidus")
Text(0.5, 1.0, 'Résidus')
- On peut voir que globalement les résidus ont l'air d'être autocorrélés spatiallement.
- Le Qatar a une valeur de résidu particulièrement élevée (émissions sous-estimées)
On va essayer en enlevant les pays "outliers"
import statsmodels.api as sm
explicative_vars = ["population", "life_expectancy", "log_child_mortality", "hdi_2023"]
target = "co2_emissions"
countries_without_high_residus = countries[countries["residus"]<15]
X2 = sm.add_constant(countries_without_high_residus[explicative_vars])
y2 = countries_without_high_residus[target]
#Ordinary Least Square
model2 = sm.OLS(y2, X2)
results2 = model2.fit()
print(results2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: co2_emissions R-squared: 0.455
Model: OLS Adj. R-squared: 0.443
Method: Least Squares F-statistic: 37.21
Date: Thu, 05 Mar 2026 Prob (F-statistic): 1.34e-22
Time: 13:24:03 Log-Likelihood: -448.35
No. Observations: 183 AIC: 906.7
Df Residuals: 178 BIC: 922.7
Df Model: 4
Covariance Type: nonrobust
=======================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------
const -9.3549 6.582 -1.421 0.157 -22.345 3.635
population 1.747e-09 1.37e-09 1.279 0.203 -9.49e-10 4.44e-09
life_expectancy -0.0568 0.084 -0.678 0.499 -0.222 0.109
log_child_mortality 0.7364 0.994 0.741 0.460 -1.225 2.698
hdi_2023 22.1422 3.736 5.927 0.000 14.770 29.514
==============================================================================
Omnibus: 101.890 Durbin-Watson: 1.847
Prob(Omnibus): 0.000 Jarque-Bera (JB): 456.080
Skew: 2.217 Prob(JB): 9.19e-100
Kurtosis: 9.336 Cond. No. 5.09e+09
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.09e+09. This might indicate that there are
strong multicollinearity or other numerical problems.
On voit qu'on a augmenté le R² : le modèle est mieux adapté sans ces valeurs extrêmes
y_pred2 = results.predict(X)
countries["predicted_co2_emissions2"] = y_pred2
countries["residus2"] = countries["co2_emissions"] - countries["predicted_co2_emissions2"]
/usr/local/lib/python3.12/dist-packages/geopandas/geodataframe.py:1969: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy super().__setitem__(key, value)
#calcul des valeurs min et max pour avoir une échelle commune
vmin = min(countries["co2_emissions"].min(), countries["predicted_co2_emissions2"].min())
vmax = max(countries["co2_emissions"].max(), countries["predicted_co2_emissions2"].max())
countries.plot(column="co2_emissions", figsize=(12,6), legend=True, vmin=vmin, vmax=vmax).set_title("Emissions annuelles de CO₂ par habitant")
countries.plot(column="predicted_co2_emissions2", figsize=(12,6), legend=True, vmin=vmin, vmax=vmax).set_title("Emissions annuelles de CO₂ par habitant prédites")
countries.plot(column="residus2", figsize=(12,6), legend=True, cmap="coolwarm").set_title("Résidus")
Text(0.5, 1.0, 'Résidus')
Globalement, les valeurs absolues des résidus ont diminué pour tous les pays, sauf pour ceux qu'on a exclu pour lesquels les résidus ont augmenté
Régression en enlevant les variables non significatives
import statsmodels.api as sm
explicative_vars = ["hdi_2023"]
target = "co2_emissions"
countries_without_high_residus = countries[countries["residus"]<15]
X3 = sm.add_constant(countries_without_high_residus[explicative_vars])
y3 = countries_without_high_residus[target]
#Ordinary Least Square
model3 = sm.OLS(y2, X3)
results3 = model3.fit()
print(results3.summary())
OLS Regression Results
==============================================================================
Dep. Variable: co2_emissions R-squared: 0.444
Model: OLS Adj. R-squared: 0.441
Method: Least Squares F-statistic: 144.4
Date: Thu, 05 Mar 2026 Prob (F-statistic): 7.71e-25
Time: 13:22:48 Log-Likelihood: -450.27
No. Observations: 183 AIC: 904.5
Df Residuals: 181 BIC: 911.0
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -8.7836 1.057 -8.308 0.000 -10.870 -6.697
hdi_2023 16.9033 1.406 12.018 0.000 14.128 19.678
==============================================================================
Omnibus: 102.644 Durbin-Watson: 1.807
Prob(Omnibus): 0.000 Jarque-Bera (JB): 456.434
Skew: 2.243 Prob(JB): 7.70e-100
Kurtosis: 9.304 Cond. No. 10.4
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Les résultats sont légèrement moins bons (R² ajusté plus faible)
3) Autocorrélation spatiale¶
W = countries.geometry.apply(lambda geom: countries.geometry.touches(geom)).astype(int)
W
| name | Indonesia | Malaysia | Chile | Bolivia | Peru | Argentina | Cyprus | India | China | Israel | ... | Tonga | Samoa | Solomon Islands | Tuvalu | Maldives | Nauru | Micronesia (country) | Vanuatu | Palau | Bahrain |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| name | |||||||||||||||||||||
| Indonesia | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Malaysia | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Chile | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Bolivia | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Peru | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Nauru | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Micronesia (country) | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Vanuatu | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Palau | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Bahrain | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
188 rows × 188 columns
Qui sont les voisins de la France ?
voisins_france = W.loc["France"][W.loc["France"]==1]
voisins_france
| France | |
|---|---|
| name | |
| Suriname | 1 |
| Brazil | 1 |
| Germany | 1 |
| Luxembourg | 1 |
| Belgium | 1 |
| Spain | 1 |
| Italy | 1 |
| Switzerland | 1 |
| Andorra | 1 |
Les valeurs de la variable sont-elles similaires pour la France et ses voisins ?
print(countries.loc["France"][target])
print(countries.loc[voisins_france.index][target])
4.067842 name Suriname 4.561335 Brazil 2.292272 Germany 7.022808 Luxembourg 10.285255 Belgium 7.231636 Spain 4.497049 Italy 5.248631 Switzerland 3.604899 Andorra 5.170065 Name: co2_emissions, dtype: float64
Wtilde = W/W.sum(axis=0)
x = countries[target]
xc = x - x.mean()
Wtilde = Wtilde.fillna(0)
Diagramme de Moran
| co2_emissions | |
|---|---|
| name | |
| Indonesia | -1.610972 |
| Malaysia | 3.543656 |
| Chile | -0.374605 |
| Bolivia | -2.311358 |
| Peru | -2.370211 |
| ... | ... |
| Nauru | 0.611038 |
| Micronesia (country) | -3.043485 |
| Vanuatu | -3.733999 |
| Palau | 7.857947 |
| Bahrain | 20.387875 |
188 rows × 1 columns
plt.scatter(xc, Wtilde @ xc)
plt.xlabel("xc")
plt.ylabel("Wtilde @ xc")
plt.axvline(0)
plt.axhline(0)
#Afficher le point de la France
plt.plot(xc.loc["France"], (Wtilde @ xc).loc["France"], "or")
plt.text(xc.loc["France"], (Wtilde @ xc).loc["France"], "France")
Text(-0.25431276723404306, 3.411865224021276, 'France')
Les emissions de CO2 de la France sont légèrement en dessous de la moyenne mondiale, tandis que celles de ces voisins sont au dessus de la moyenne mondiale
Ii = xc * (Wtilde @ xc) / x.var()
countries["$I_i$ "+target] = Ii
ax = countries[countries["$I_i$ "+target]>0].plot(column="$I_i$ "+target, figsize=(18,6), legend=True, cmap="Reds")
countries[countries["$I_i$ "+target]<=0].plot(column="$I_i$ "+target, figsize=(18,6), legend=True, cmap="Blues_r", ax=ax)
ax.set_title("$I_i$ "+target)
/usr/local/lib/python3.12/dist-packages/geopandas/geodataframe.py:1969: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy super().__setitem__(key, value)
Text(0.5, 1.0, '$I_i$ co2_emissions')
I = Ii.sum()/Wtilde.sum().sum()
I
np.float64(0.48546924209009396)
Ici, I est supérieur à 0 : globalement les émissions de CO2 d'un pays ressemblent à celles de ces voisins, mais ce n'est pas toujours le cas
def compute_I(var):
x = countries[var]
xc = x - x.mean()
Ii = xc * (Wtilde @ xc) / x.var()
ax = countries[Ii>0].plot(column=Ii[Ii>0], figsize=(18,6), legend=True, cmap="Reds")
countries[Ii<=0].plot(ax=ax, column=Ii[Ii<=0], legend=True, cmap="Blues_r").set_title("I_i "+ var)
I = Ii.sum()/Wtilde.sum().sum()
return I
for var in explicative_vars:
print(var, compute_I(var))
print("co2_emissions", compute_I(target))
print("residus", compute_I("residus"))
hdi_2023 0.810427511484669 co2_emissions 0.48546924209009396 residus 0.4365841553095059
On peut constater que l'IDH est très auto-corrélé spatialement (I proche de 1)
Les résidus ont une auto-corrélation non nulle : cela justifie le fait d'utiliser une GWR
4) GWR¶
def distances(pays_cible):
R = 6371000
#On calcule les centroides en coordonnées projetées
pays_cible = countries.to_crs(epsg="3857").loc[pays_cible]
centroid_cible = pays_cible.geometry.centroid
centroid_cible = gpd.GeoSeries([centroid_cible], crs="EPSG:3857").to_crs(epsg="4326")#On retourne sur les coordonnées lat lon
centroids = countries.to_crs(epsg="3857").geometry.centroid.to_crs(epsg="4326")
#On calcule les distances avec la formule de Haversine
lat1 = centroid_cible.y * np.pi/180
lon1 = centroid_cible.x * np.pi/180
lat2 = centroids.y * np.pi/180
lon2 = centroids.x * np.pi/180
delta_lat = lat1.values - lat2.values
delta_lon = lon1.values - lon2.values
a = np.sin(delta_lat/2)**2 + np.cos(lat1.values) * np.cos(lat2.values) * np.sin(delta_lon/2)**2
distances = 2*R*np.arcsin(np.sqrt(a))
return pd.Series(distances, index=centroids.index)
distances("France").sort_values()
| 0 | |
|---|---|
| name | |
| France | 0.000000e+00 |
| Andorra | 2.887777e+05 |
| Spain | 4.692477e+05 |
| Portugal | 7.890318e+05 |
| Switzerland | 7.917678e+05 |
| ... | ... |
| Samoa | 1.652552e+07 |
| Fiji | 1.671480e+07 |
| Vanuatu | 1.673755e+07 |
| Tonga | 1.727393e+07 |
| New Zealand | 1.936601e+07 |
188 rows × 1 columns
#fonction Kernel
def K(x):
return np.exp(-0.5*x**2)
def w(pays_1, pays_2, h):
distances_pays1 = distances(pays_1)
d = distances_pays1.loc[pays_2]
return K(d/h)
print(w("France", "Spain", 1000000))#h = 1000km
print(w("France", "Chile", 1000000))
0.895747518969046 1.38397335406039e-30
def Wi(pays_cible, h):
distances_pays_cible = distances(pays_cible)
wik = K(distances_pays_cible/h)
return pd.DataFrame(np.diag(wik), index=wik.index, columns=wik.index)
Wi("France", 1000000)
| name | Indonesia | Malaysia | Chile | Bolivia | Peru | Argentina | Cyprus | India | China | Israel | ... | Tonga | Samoa | Solomon Islands | Tuvalu | Maldives | Nauru | Micronesia (country) | Vanuatu | Palau | Bahrain |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| name | |||||||||||||||||||||
| Indonesia | 3.307843e-34 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
| Malaysia | 0.000000e+00 | 1.036276e-28 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
| Chile | 0.000000e+00 | 0.000000e+00 | 1.383973e-30 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
| Bolivia | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.230765e-19 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
| Peru | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 5.888055e-20 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Nauru | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.870316e-50 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
| Micronesia (country) | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 1.997287e-42 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
| Vanuatu | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.000000e+00 | 1.469473e-61 | 0.000000e+00 | 0.000000 |
| Palau | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 3.620417e-36 | 0.000000 |
| Bahrain | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000003 |
188 rows × 188 columns
def Betas_i(pays_cible, h):
Wi_pays_cible = Wi(pays_cible, h)
return np.linalg.solve(X.T @ Wi_pays_cible @ X, X.T @ Wi_pays_cible @ y)
Betas_i("France", 1000000)
array([ 1.25866334e+01, -7.28745501e-09, -2.52405679e-01, -2.33875524e+00,
1.56641687e+01])
On obtient les coefficients de la régression linéaire pour la France, qu'on peut comparer à ceux de la régression globale :
results.params
| 0 | |
|---|---|
| const | -2.363059e+01 |
| population | 5.968180e-10 |
| life_expectancy | 8.826256e-02 |
| log_child_mortality | 2.413059e+00 |
| hdi_2023 | 2.564717e+01 |
def predire(pays_cible, h):
Betas_i_pays_cible = Betas_i(pays_cible, h)
return (X @ Betas_i_pays_cible).loc[pays_cible]
predire("France", 1000000)
np.float64(4.645206727664256)
countries.loc["France", target]
np.float64(4.067842)
On est très proche !
y_pred = [predire(pays, 1000000) for pays in countries.index]
countries["predicted_co2_emissions_gwr"] = y_pred
/usr/local/lib/python3.12/dist-packages/geopandas/geodataframe.py:1969: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy super().__setitem__(key, value)
residus = countries["co2_emissions"] - countries["predicted_co2_emissions_gwr"]
vmin = min(countries["co2_emissions"].min(), countries["predicted_co2_emissions_gwr"].min())
vmax = max(countries["co2_emissions"].max(), countries["predicted_co2_emissions_gwr"].max())
countries.plot(column="co2_emissions", figsize=(12,6), legend=True, vmin=vmin, vmax=vmax).set_title("Emissions annuelles de CO₂ par habitant")
countries.plot(column="predicted_co2_emissions_gwr", figsize=(12,6), legend=True, vmin=vmin, vmax=vmax).set_title("Emissions annuelles de CO₂ par habitant prédites par GWR, pour h=1000000")
countries.plot(column=residus, figsize=(12,6), legend=True, cmap="coolwarm").set_title("Résidus")
Text(0.5, 1.0, 'Résidus')
Calcul du R² et du RMSE
sum_of_squares_errors = (residus**2).sum()
RMSE = np.sqrt(sum_of_squares_errors/len(countries))
print(RMSE)
2.4714162052837563
sum_of_squares_distance_to_mean = ((countries[target]-countries[target].mean())**2).sum()
R_2 = 1 - sum_of_squares_errors / sum_of_squares_distance_to_mean
print(R_2)
0.800751344305064
Adjusted_R_2 = 1 - (1 - R_2) * (len(countries) - 1) / (len(countries) - X.shape[1] - 1)
print(Adjusted_R_2)
0.7952774801376207
On voit que les résultats de la GWR sont biens meilleurs que ceux de la régression globale, car elle s'adapte au contexte local
Carte des betas
betas_i_all_countries = [Betas_i(pays, 1000000) for pays in countries.index]
gdf_betas_i_all_countries = gpd.GeoDataFrame(
betas_i_all_countries,
index=countries.index,
columns=X.columns,
geometry=countries.geometry
)
gdf_betas_i_all_countries
| const | population | life_expectancy | log_child_mortality | hdi_2023 | geometry | |
|---|---|---|---|---|---|---|
| name | ||||||
| Indonesia | 266.116496 | -4.312500e-08 | -4.190585 | -28.178468 | 106.246494 | MULTIPOLYGON (((117.70361 4.16342, 117.70361 4... |
| Malaysia | 240.994841 | -5.098763e-08 | -4.208162 | -19.270307 | 131.076197 | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... |
| Chile | -67.635768 | -8.604978e-09 | 0.166714 | 12.428998 | 58.611579 | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... |
| Bolivia | -19.550895 | 1.265115e-09 | -0.112319 | 3.579118 | 34.188579 | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... |
| Peru | -4.188316 | -8.626947e-09 | -0.215201 | 1.480612 | 28.110437 | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... |
| ... | ... | ... | ... | ... | ... | ... |
| Nauru | 17.092764 | -3.923703e-08 | -0.338677 | -0.694999 | 13.095048 | POLYGON ((166.93881 -0.49041, 166.95558 -0.497... |
| Micronesia (country) | -21.695788 | 2.087967e-08 | 0.049062 | -1.182948 | 34.391673 | MULTIPOLYGON (((163.02605 5.34089, 163.03045 5... |
| Vanuatu | 16.979232 | 2.980912e-07 | -0.255604 | -3.199578 | 7.853731 | MULTIPOLYGON (((169.84034 -20.1408, 169.86052 ... |
| Palau | -1.895352 | -4.024749e-08 | -0.099196 | -10.408483 | 42.271629 | MULTIPOLYGON (((134.2715 7.07453, 134.27931 7.... |
| Bahrain | -175.934514 | -1.592962e-08 | 2.032739 | 13.850842 | 28.074399 | MULTIPOLYGON (((50.55161 26.19424, 50.59474 26... |
188 rows × 6 columns
for var in X.columns:
gdf_betas_i_all_countries.plot(column=var, figsize=(12,6), legend=True).set_title(
f"Beta_{var}(i)"
)
Impact du changement de valeur de bande passante h
y_pred = [predire(pays, 5000000) for pays in countries.index] # h=5000km
countries["predicted_co2_emissions_gwr"] = y_pred
/usr/local/lib/python3.12/dist-packages/geopandas/geodataframe.py:1969: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy super().__setitem__(key, value)
residus = countries["co2_emissions"] - countries["predicted_co2_emissions_gwr"]
vmin = min(countries["co2_emissions"].min(), countries["predicted_co2_emissions_gwr"].min())
vmax = max(countries["co2_emissions"].max(), countries["predicted_co2_emissions_gwr"].max())
countries.plot(column="co2_emissions", figsize=(12,6), legend=True, vmin=vmin, vmax=vmax).set_title("Emissions annuelles de CO₂ par habitant")
countries.plot(column="predicted_co2_emissions_gwr", figsize=(12,6), legend=True, vmin=vmin, vmax=vmax).set_title("Emissions annuelles de CO₂ par habitant prédites par GWR, pour h=5000000")
countries.plot(column=residus, figsize=(12,6), legend=True, cmap="coolwarm").set_title("Résidus")
Text(0.5, 1.0, 'Résidus')