import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
Chargement des contours des pays¶
countries = gpd.read_file("countries.geojson")
countries.plot()
<Axes: >
countries
| name | ISO3166-1-Alpha-3 | ISO3166-1-Alpha-2 | geometry | |
|---|---|---|---|---|
| 0 | Indonesia | IDN | ID | MULTIPOLYGON (((117.70361 4.16342, 117.70361 4... |
| 1 | Malaysia | MYS | MY | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... |
| 2 | Chile | CHL | CL | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... |
| 3 | Bolivia | BOL | BO | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... |
| 4 | Peru | PER | PE | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... |
| ... | ... | ... | ... | ... |
| 253 | Macao S.A.R | MAC | MO | MULTIPOLYGON (((113.5586 22.16303, 113.56943 2... |
| 254 | Ashmore and Cartier Islands | -99 | -99 | POLYGON ((123.59702 -12.42832, 123.59775 -12.4... |
| 255 | Bajo Nuevo Bank (Petrel Is.) | -99 | -99 | POLYGON ((-79.98929 15.79495, -79.98782 15.796... |
| 256 | Serranilla Bank | -99 | -99 | POLYGON ((-78.63707 15.86209, -78.64041 15.864... |
| 257 | Scarborough Reef | -99 | -99 | POLYGON ((117.75389 15.15437, 117.75569 15.151... |
258 rows × 4 columns
On assigne la colonne "name" comme index de la couche des pays
countries = countries.set_index("name")
Les noms ne correspondent pas à ceux dans les données téléchargées sur Our World in Data, on va les renommer
# Country names as spelled in countries.geojson -> spelling used in the
# Our World in Data tables, so both sources share the same index labels.
geojson_to_owid_names = {
    "United States of America": "United States",
    "Democratic Republic of the Congo": "Democratic Republic of Congo",
    "Ivory Coast": "Cote d'Ivoire",
    "Republic of the Congo": "Congo",
    "Czech Republic": "Czechia",
    "The Bahamas": "Bahamas",
    "Guinea Bissau": "Guinea-Bissau",
    "Federated States of Micronesia": "Micronesia (country)",
    "Macedonia": "North Macedonia",
    "eSwatini": "Eswatini",
    "Republic of Serbia": "Serbia",
    "United Republic of Tanzania": "Tanzania",
    "São Tomé and Principe": "Sao Tome and Principe",
    "Cape Verde": "Cabo Verde",
}
countries = countries.rename(index=geojson_to_owid_names)
Chargement des données¶
Cas 1: On a téléchargé des données pour une seule date¶
life_expectancy = pd.read_csv("life-expectancy.csv")
life_expectancy.head()
| Entity | Code | Year | Period life expectancy at birth | time | |
|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | 2023 | 66.0346 | 2023 |
| 1 | Albania | ALB | 2023 | 79.6019 | 2023 |
| 2 | Algeria | DZA | 2023 | 76.2610 | 2023 |
| 3 | Andorra | AND | 2023 | 84.0406 | 2023 |
| 4 | Angola | AGO | 2023 | 64.6170 | 2023 |
# Index the table by country name ("Entity") so it can be matched with the
# name-indexed `countries` layer.
life_expectancy = life_expectancy.set_index("Entity")
life_expectancy.head()
| Code | Year | Period life expectancy at birth | time | |
|---|---|---|---|---|
| Entity | ||||
| Afghanistan | AFG | 2023 | 66.0346 | 2023 |
| Albania | ALB | 2023 | 79.6019 | 2023 |
| Algeria | DZA | 2023 | 76.2610 | 2023 |
| Andorra | AND | 2023 | 84.0406 | 2023 |
| Angola | AGO | 2023 | 64.6170 | 2023 |
# Fill the new column with placeholder integers 0..n-1 (one per row).
# NOTE(review): presumably only a demonstration of column creation; the
# column is overwritten with the real values further down.
countries["life_expectancy"]=np.arange(len(countries))
countries.head()
| ISO3166-1-Alpha-3 | ISO3166-1-Alpha-2 | geometry | life_expectancy | |
|---|---|---|---|---|
| name | ||||
| Indonesia | IDN | ID | MULTIPOLYGON (((117.70361 4.16342, 117.70361 4... | 0 |
| Malaysia | MYS | MY | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... | 1 |
| Chile | CHL | CL | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... | 2 |
| Bolivia | BOL | BO | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... | 3 |
| Peru | PER | PE | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... | 4 |
life_expectancy["Period life expectancy at birth"]
| Period life expectancy at birth | |
|---|---|
| Entity | |
| Afghanistan | 66.0346 |
| Albania | 79.6019 |
| Algeria | 76.2610 |
| Andorra | 84.0406 |
| Angola | 64.6170 |
| ... | ... |
| Vietnam | 74.5883 |
| Western Sahara | 71.3850 |
| Yemen | 69.2952 |
| Zambia | 66.3487 |
| Zimbabwe | 62.7748 |
201 rows × 1 columns
# Assigning a Series aligns on the index: each country (index "name") gets the
# value of the row carrying the same label in `life_expectancy` (index
# "Entity"). Countries without a matching label receive NaN.
countries["life_expectancy"]=life_expectancy["Period life expectancy at birth"]
countries.head()
| ISO3166-1-Alpha-3 | ISO3166-1-Alpha-2 | geometry | life_expectancy | |
|---|---|---|---|---|
| name | ||||
| Indonesia | IDN | ID | MULTIPOLYGON (((117.70361 4.16342, 117.70361 4... | 71.1457 |
| Malaysia | MYS | MY | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... | 76.6571 |
| Chile | CHL | CL | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... | 81.1667 |
| Bolivia | BOL | BO | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... | 68.5814 |
| Peru | PER | PE | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... | 77.7401 |
# Choropleth map of life expectancy.
life_expectancy_ax = countries.plot(column="life_expectancy", legend=True, figsize=(12, 6))
life_expectancy_ax.set_title("life expectancy")
Text(0.5, 1.0, 'life expectancy')
On fait la même chose pour les autres variables étudiées
# Load the population table, index it by country name and copy the population
# column into `countries` (rows are matched on the index).
population_table = pd.read_csv("population.csv").set_index("Entity")
countries["population"] = population_table["Population (historical)"]
# Populations span several orders of magnitude: also store their base-10 log.
countries["log_population"] = np.log10(countries["population"])
log_population_ax = countries.plot(column="log_population", legend=True, figsize=(12, 6))
log_population_ax.set_title("population (log scale)")
Text(0.5, 1.0, 'population (log scale)')
# Per-capita CO₂ emissions, indexed by country name and matched on the index.
co2_table = pd.read_csv("co-emissions-per-capita.csv").set_index("Entity")
countries["co2_emissions"] = co2_table["Annual CO₂ emissions (per capita)"]
co2_ax = countries.plot(column="co2_emissions", legend=True, figsize=(12, 6))
co2_ax.set_title("Emissions annuelles de CO₂ par habitant")
Text(0.5, 1.0, 'Emissions annuelles de CO₂ par habitant')
# Child mortality rate, indexed by country name and matched on the index.
child_mortality_table = pd.read_csv("child-mortality.csv").set_index("Entity")
countries["child_mortality"] = child_mortality_table["Child mortality rate"]
child_mortality_ax = countries.plot(column="child_mortality", legend=True, figsize=(12, 6))
child_mortality_ax.set_title("Taux de mortalité infantile")
Text(0.5, 1.0, 'Taux de mortalité infantile')
Cas 2 : on a téléchargé la série temporelle¶
hdi = pd.read_csv("human-development-index.csv")
hdi.head()
| Entity | Code | Year | Human Development Index | World regions according to OWID | |
|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | 1990 | 0.285 | NaN |
| 1 | Afghanistan | AFG | 1991 | 0.291 | NaN |
| 2 | Afghanistan | AFG | 1992 | 0.301 | NaN |
| 3 | Afghanistan | AFG | 1993 | 0.311 | NaN |
| 4 | Afghanistan | AFG | 1994 | 0.305 | NaN |
# Reshape the long table (one row per Entity/Year) into a wide one: one row
# per Entity, one column per Year, cells holding the Human Development Index.
# pivot_table aggregates duplicated (Entity, Year) pairs with the mean by default.
hdi_by_year = hdi.pivot_table(index='Entity', columns='Year', values='Human Development Index')
hdi_by_year.head()
| Year | 1990 | 1991 | 1992 | 1993 | 1994 | 1995 | 1996 | 1997 | 1998 | 1999 | ... | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | 2023 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Entity | |||||||||||||||||||||
| Afghanistan | 0.285 | 0.291 | 0.301 | 0.311 | 0.305 | 0.329 | 0.334 | 0.338 | 0.338 | 0.347 | ... | 0.497000 | 0.49600 | 0.495000 | 0.4960 | 0.498000 | 0.507000 | 0.501000 | 0.486000 | 0.495000 | 0.496000 |
| Africa | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.536723 | 0.54259 | 0.546639 | 0.5504 | 0.555833 | 0.560354 | 0.559465 | 0.561104 | 0.571485 | 0.576059 |
| Albania | 0.654 | 0.638 | 0.622 | 0.624 | 0.629 | 0.638 | 0.647 | 0.645 | 0.659 | 0.671 | ... | 0.797000 | 0.79700 | 0.797000 | 0.7980 | 0.801000 | 0.805000 | 0.794000 | 0.794000 | 0.806000 | 0.810000 |
| Algeria | 0.595 | 0.596 | 0.601 | 0.603 | 0.603 | 0.608 | 0.615 | 0.624 | 0.634 | 0.643 | ... | 0.732000 | 0.73700 | 0.743000 | 0.7460 | 0.749000 | 0.753000 | 0.742000 | 0.755000 | 0.761000 | 0.763000 |
| Andorra | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.866000 | 0.86900 | 0.872000 | 0.8730 | 0.875000 | 0.876000 | 0.851000 | 0.871000 | 0.893000 | 0.913000 |
5 rows × 34 columns
def function_to_rename_columns_name(column_name):
    """Return the column label prefixed with ``"hdi_"``.

    Used as the ``columns`` mapper of ``DataFrame.rename`` so that the year
    columns of the pivoted HDI table become ``hdi_<year>``.

    Args:
        column_name: Original column label (e.g. the year ``1990``).

    Returns:
        The string ``f"hdi_{column_name}"``, e.g. ``"hdi_1990"``.
    """
    return f"hdi_{column_name}"
hdi_by_year = hdi_by_year.rename(columns=function_to_rename_columns_name)
hdi_by_year.head()
| Year | hdi_1990 | hdi_1991 | hdi_1992 | hdi_1993 | hdi_1994 | hdi_1995 | hdi_1996 | hdi_1997 | hdi_1998 | hdi_1999 | ... | hdi_2014 | hdi_2015 | hdi_2016 | hdi_2017 | hdi_2018 | hdi_2019 | hdi_2020 | hdi_2021 | hdi_2022 | hdi_2023 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Entity | |||||||||||||||||||||
| Afghanistan | 0.285 | 0.291 | 0.301 | 0.311 | 0.305 | 0.329 | 0.334 | 0.338 | 0.338 | 0.347 | ... | 0.497000 | 0.49600 | 0.495000 | 0.4960 | 0.498000 | 0.507000 | 0.501000 | 0.486000 | 0.495000 | 0.496000 |
| Africa | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.536723 | 0.54259 | 0.546639 | 0.5504 | 0.555833 | 0.560354 | 0.559465 | 0.561104 | 0.571485 | 0.576059 |
| Albania | 0.654 | 0.638 | 0.622 | 0.624 | 0.629 | 0.638 | 0.647 | 0.645 | 0.659 | 0.671 | ... | 0.797000 | 0.79700 | 0.797000 | 0.7980 | 0.801000 | 0.805000 | 0.794000 | 0.794000 | 0.806000 | 0.810000 |
| Algeria | 0.595 | 0.596 | 0.601 | 0.603 | 0.603 | 0.608 | 0.615 | 0.624 | 0.634 | 0.643 | ... | 0.732000 | 0.73700 | 0.743000 | 0.7460 | 0.749000 | 0.753000 | 0.742000 | 0.755000 | 0.761000 | 0.763000 |
| Andorra | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.866000 | 0.86900 | 0.872000 | 0.8730 | 0.875000 | 0.876000 | 0.851000 | 0.871000 | 0.893000 | 0.913000 |
5 rows × 34 columns
hdi_by_year.isna().sum(axis=0)
| 0 | |
|---|---|
| Year | |
| hdi_1990 | 60 |
| hdi_1991 | 60 |
| hdi_1992 | 60 |
| hdi_1993 | 59 |
| hdi_1994 | 59 |
| hdi_1995 | 51 |
| hdi_1996 | 51 |
| hdi_1997 | 51 |
| hdi_1998 | 51 |
| hdi_1999 | 46 |
| hdi_2000 | 24 |
| hdi_2001 | 24 |
| hdi_2002 | 23 |
| hdi_2003 | 22 |
| hdi_2004 | 20 |
| hdi_2005 | 7 |
| hdi_2006 | 7 |
| hdi_2007 | 6 |
| hdi_2008 | 5 |
| hdi_2009 | 6 |
| hdi_2010 | 2 |
| hdi_2011 | 1 |
| hdi_2012 | 1 |
| hdi_2013 | 1 |
| hdi_2014 | 1 |
| hdi_2015 | 1 |
| hdi_2016 | 1 |
| hdi_2017 | 1 |
| hdi_2018 | 1 |
| hdi_2019 | 1 |
| hdi_2020 | 1 |
| hdi_2021 | 1 |
| hdi_2022 | 1 |
| hdi_2023 | 0 |
# Copy the 2023 and 2013 HDI columns into `countries`; rows are matched on the
# country-name index, countries absent from the HDI table get NaN.
countries[["hdi_2023", "hdi_2013"]]=hdi_by_year[["hdi_2023", "hdi_2013"]]
countries
| ISO3166-1-Alpha-3 | ISO3166-1-Alpha-2 | geometry | life_expectancy | population | log_population | child_mortality | co2_emissions | hdi_2023 | hdi_2013 | |
|---|---|---|---|---|---|---|---|---|---|---|
| name | ||||||||||
| Indonesia | IDN | ID | MULTIPOLYGON (((117.70361 4.16342, 117.70361 4... | 71.1457 | 281190068.0 | 8.449000 | 2.06 | 2.711182 | 0.728 | 0.690 |
| Malaysia | MYS | MY | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... | 76.6571 | 35126295.0 | 7.545632 | 0.81 | 7.865810 | 0.819 | 0.791 |
| Chile | CHL | CL | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... | 81.1667 | 19658833.0 | 7.293558 | 0.72 | 3.947550 | 0.878 | 0.845 |
| Bolivia | BOL | BO | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... | 68.5814 | 12244161.0 | 7.087929 | 2.31 | 2.010797 | 0.733 | 0.699 |
| Peru | PER | PE | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... | 77.7401 | 33845616.0 | 7.529502 | 1.58 | 1.951944 | 0.794 | 0.757 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Macao S.A.R | MAC | MO | MULTIPOLYGON (((113.5586 22.16303, 113.56943 2... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Ashmore and Cartier Islands | -99 | -99 | POLYGON ((123.59702 -12.42832, 123.59775 -12.4... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Bajo Nuevo Bank (Petrel Is.) | -99 | -99 | POLYGON ((-79.98929 15.79495, -79.98782 15.796... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Serranilla Bank | -99 | -99 | POLYGON ((-78.63707 15.86209, -78.64041 15.864... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Scarborough Reef | -99 | -99 | POLYGON ((117.75389 15.15437, 117.75569 15.151... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
258 rows × 10 columns
# Choropleth map of the 2023 Human Development Index.
hdi_ax = countries.plot(column='hdi_2023', legend=True, figsize=(12, 6))
hdi_ax.set_title("Human Development Index (2023)")
Text(0.5, 1.0, 'Human Development Index (2023)')
Gestion des valeurs NaN¶
La plupart des méthodes d'analyse statistiques ne supportent pas les NaN, par exemple l'ACP.
Si l'analyse de ces unités spatiales est importante, on peut leur attribuer une valeur, par exemple la moyenne pour chaque variable. Si les variables avec un NaN sont corrélées avec une variable pour laquelle la valeur est connue, on peut inférer avec une régression linéaire.
Dans notre cas, on va simplement ignorer les individus avec des NaN, avec la méthode dropna().
# Drop every country that has a NaN in at least one column.
# The explicit .copy() makes `countries` an independent GeoDataFrame, so that
# adding columns to it afterwards (e.g. "political_regime") does not raise
# pandas' SettingWithCopyWarning.
countries = countries.dropna().copy()
countries
| ISO3166-1-Alpha-3 | ISO3166-1-Alpha-2 | geometry | life_expectancy | population | log_population | child_mortality | co2_emissions | hdi_2023 | hdi_2013 | |
|---|---|---|---|---|---|---|---|---|---|---|
| name | ||||||||||
| Indonesia | IDN | ID | MULTIPOLYGON (((117.70361 4.16342, 117.70361 4... | 71.1457 | 281190068.0 | 8.449000 | 2.06 | 2.711182 | 0.728 | 0.690 |
| Malaysia | MYS | MY | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... | 76.6571 | 35126295.0 | 7.545632 | 0.81 | 7.865810 | 0.819 | 0.791 |
| Chile | CHL | CL | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... | 81.1667 | 19658833.0 | 7.293558 | 0.72 | 3.947550 | 0.878 | 0.845 |
| Bolivia | BOL | BO | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... | 68.5814 | 12244161.0 | 7.087929 | 2.31 | 2.010797 | 0.733 | 0.699 |
| Peru | PER | PE | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... | 77.7401 | 33845616.0 | 7.529502 | 1.58 | 1.951944 | 0.794 | 0.757 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Nauru | NRU | NR | POLYGON ((166.93881 -0.49041, 166.95558 -0.497... | 62.1094 | 11900.0 | 4.075547 | 0.89 | 4.933193 | 0.703 | 0.640 |
| Micronesia (country) | FSM | FM | MULTIPOLYGON (((163.02605 5.34089, 163.03045 5... | 67.1983 | 112646.0 | 5.051716 | 2.31 | 1.278669 | 0.615 | 0.603 |
| Vanuatu | VUT | VU | MULTIPOLYGON (((169.84034 -20.1408, 169.86052 ... | 71.4769 | 320422.0 | 5.505722 | 1.68 | 0.588156 | 0.621 | 0.592 |
| Palau | PLW | PW | MULTIPOLYGON (((134.2715 7.07453, 134.27931 7.... | 69.2690 | 17751.0 | 4.249223 | 2.23 | 12.180102 | 0.786 | 0.791 |
| Bahrain | BHR | BH | MULTIPOLYGON (((50.55161 26.19424, 50.59474 26... | 81.2835 | 1569674.0 | 6.195809 | 0.86 | 24.710030 | 0.899 | 0.846 |
186 rows × 10 columns
countries.plot()
<Axes: >
Exploration du jeu¶
countries.describe()# Show univariate summary statistics for each numeric variable
| life_expectancy | population | log_population | child_mortality | co2_emissions | hdi_2023 | hdi_2013 | |
|---|---|---|---|---|---|---|---|
| count | 186.000000 | 1.860000e+02 | 186.000000 | 186.000000 | 186.000000 | 186.000000 | 186.000000 |
| mean | 73.048182 | 4.303034e+07 | 6.854228 | 2.478011 | 4.360875 | 0.740656 | 0.710382 |
| std | 7.025400 | 1.534759e+08 | 0.937179 | 2.469786 | 5.568655 | 0.150070 | 0.154175 |
| min | 54.462300 | 9.837000e+03 | 3.992863 | 0.210000 | 0.055592 | 0.388000 | 0.359000 |
| 25% | 67.450850 | 2.538054e+06 | 6.404222 | 0.610000 | 0.808994 | 0.621250 | 0.583250 |
| 50% | 73.456050 | 9.495406e+06 | 6.977426 | 1.560000 | 2.862610 | 0.762500 | 0.739000 |
| 75% | 78.125250 | 3.354244e+07 | 7.525590 | 3.845000 | 5.346318 | 0.856750 | 0.829250 |
| max | 84.712300 | 1.438070e+09 | 9.157780 | 11.480000 | 40.127865 | 0.972000 | 0.953000 |
pd.plotting.scatter_matrix(countries, figsize=(10, 10))
array([[<Axes: xlabel='life_expectancy', ylabel='life_expectancy'>,
<Axes: xlabel='population', ylabel='life_expectancy'>,
<Axes: xlabel='log_population', ylabel='life_expectancy'>,
<Axes: xlabel='child_mortality', ylabel='life_expectancy'>,
<Axes: xlabel='co2_emissions', ylabel='life_expectancy'>,
<Axes: xlabel='hdi_2023', ylabel='life_expectancy'>,
<Axes: xlabel='hdi_2013', ylabel='life_expectancy'>],
[<Axes: xlabel='life_expectancy', ylabel='population'>,
<Axes: xlabel='population', ylabel='population'>,
<Axes: xlabel='log_population', ylabel='population'>,
<Axes: xlabel='child_mortality', ylabel='population'>,
<Axes: xlabel='co2_emissions', ylabel='population'>,
<Axes: xlabel='hdi_2023', ylabel='population'>,
<Axes: xlabel='hdi_2013', ylabel='population'>],
[<Axes: xlabel='life_expectancy', ylabel='log_population'>,
<Axes: xlabel='population', ylabel='log_population'>,
<Axes: xlabel='log_population', ylabel='log_population'>,
<Axes: xlabel='child_mortality', ylabel='log_population'>,
<Axes: xlabel='co2_emissions', ylabel='log_population'>,
<Axes: xlabel='hdi_2023', ylabel='log_population'>,
<Axes: xlabel='hdi_2013', ylabel='log_population'>],
[<Axes: xlabel='life_expectancy', ylabel='child_mortality'>,
<Axes: xlabel='population', ylabel='child_mortality'>,
<Axes: xlabel='log_population', ylabel='child_mortality'>,
<Axes: xlabel='child_mortality', ylabel='child_mortality'>,
<Axes: xlabel='co2_emissions', ylabel='child_mortality'>,
<Axes: xlabel='hdi_2023', ylabel='child_mortality'>,
<Axes: xlabel='hdi_2013', ylabel='child_mortality'>],
[<Axes: xlabel='life_expectancy', ylabel='co2_emissions'>,
<Axes: xlabel='population', ylabel='co2_emissions'>,
<Axes: xlabel='log_population', ylabel='co2_emissions'>,
<Axes: xlabel='child_mortality', ylabel='co2_emissions'>,
<Axes: xlabel='co2_emissions', ylabel='co2_emissions'>,
<Axes: xlabel='hdi_2023', ylabel='co2_emissions'>,
<Axes: xlabel='hdi_2013', ylabel='co2_emissions'>],
[<Axes: xlabel='life_expectancy', ylabel='hdi_2023'>,
<Axes: xlabel='population', ylabel='hdi_2023'>,
<Axes: xlabel='log_population', ylabel='hdi_2023'>,
<Axes: xlabel='child_mortality', ylabel='hdi_2023'>,
<Axes: xlabel='co2_emissions', ylabel='hdi_2023'>,
<Axes: xlabel='hdi_2023', ylabel='hdi_2023'>,
<Axes: xlabel='hdi_2013', ylabel='hdi_2023'>],
[<Axes: xlabel='life_expectancy', ylabel='hdi_2013'>,
<Axes: xlabel='population', ylabel='hdi_2013'>,
<Axes: xlabel='log_population', ylabel='hdi_2013'>,
<Axes: xlabel='child_mortality', ylabel='hdi_2013'>,
<Axes: xlabel='co2_emissions', ylabel='hdi_2013'>,
<Axes: xlabel='hdi_2023', ylabel='hdi_2013'>,
<Axes: xlabel='hdi_2013', ylabel='hdi_2013'>]], dtype=object)
countries.iloc[:,3:].corr()# Skip the first three columns (ISO codes and geometry) before computing the correlations
| life_expectancy | population | log_population | child_mortality | co2_emissions | hdi_2023 | hdi_2013 | |
|---|---|---|---|---|---|---|---|
| life_expectancy | 1.000000 | 0.014349 | -0.002132 | -0.877801 | 0.481439 | 0.904435 | 0.900622 |
| population | 0.014349 | 1.000000 | 0.441910 | -0.003487 | 0.010886 | -0.011176 | -0.038145 |
| log_population | -0.002132 | 0.441910 | 1.000000 | 0.148135 | -0.071339 | -0.113981 | -0.124327 |
| child_mortality | -0.877801 | -0.003487 | 0.148135 | 1.000000 | -0.421016 | -0.862051 | -0.861931 |
| co2_emissions | 0.481439 | 0.010886 | -0.071339 | -0.421016 | 1.000000 | 0.540600 | 0.538467 |
| hdi_2023 | 0.904435 | -0.011176 | -0.113981 | -0.862051 | 0.540600 | 1.000000 | 0.987957 |
| hdi_2013 | 0.900622 | -0.038145 | -0.124327 | -0.861931 | 0.538467 | 0.987957 | 1.000000 |
ACP¶
Centrer-réduire les variables :
# Standardize the variables (every column after the two ISO codes and the
# geometry): subtract the column mean and divide by the column standard deviation.
variables = countries.iloc[:, 3:]
column_means = variables.mean(axis=0)
column_stds = variables.std(axis=0)
countries_cr = (variables - column_means) / column_stds
countries_cr
| life_expectancy | population | log_population | child_mortality | co2_emissions | hdi_2023 | hdi_2013 | |
|---|---|---|---|---|---|---|---|
| name | |||||||
| Indonesia | -0.270801 | 1.551773 | 1.701673 | -0.169250 | -0.296246 | -0.084333 | -0.132199 |
| Malaysia | 0.513696 | -0.051500 | 0.737750 | -0.675366 | 0.629404 | 0.522051 | 0.522902 |
| Chile | 1.155595 | -0.152281 | 0.468779 | -0.711807 | -0.074223 | 0.915201 | 0.873155 |
| Bolivia | -0.635805 | -0.200593 | 0.249366 | -0.068026 | -0.422019 | -0.051016 | -0.073824 |
| Peru | 0.667851 | -0.059845 | 0.720539 | -0.363599 | -0.432587 | 0.355462 | 0.302373 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| Nauru | -1.557033 | -0.280294 | -2.964942 | -0.642975 | 0.102775 | -0.250923 | -0.456507 |
| Micronesia (country) | -0.832676 | -0.279638 | -1.923339 | -0.068026 | -0.553492 | -0.837316 | -0.696494 |
| Vanuatu | -0.223657 | -0.278284 | -1.438899 | -0.323109 | -0.677492 | -0.797335 | -0.767842 |
| Palau | -0.537931 | -0.280256 | -2.779624 | -0.100418 | 1.404150 | 0.302153 | 0.522902 |
| Bahrain | 1.172221 | -0.270145 | -0.702554 | -0.655122 | 3.654232 | 1.055136 | 0.879641 |
186 rows × 7 columns
from sklearn.decomposition import PCA
# Fit a PCA on the standardized variables (no n_components given, so every
# component is kept) and project the countries onto the principal components.
pca = PCA()
countries_transformed = pca.fit_transform(countries_cr)
countries_transformed
array([[-0.38782093, 2.2577136 , -0.27033415, ..., -0.1075837 ,
-0.43054918, 0.01581441],
[ 1.20067973, 0.5528224 , 0.16633161, ..., -0.20558946,
-0.20109657, 0.00503812],
[ 1.67981178, 0.33923759, -0.67661612, ..., 0.11003823,
0.21270873, 0.02796701],
...,
[-0.82181178, -1.25417263, -0.44619198, ..., -0.78633218,
0.18600237, -0.02108663],
[ 0.82865724, -2.15334798, 1.48835281, ..., 0.28458681,
-0.27605793, -0.14799218],
[ 2.97659196, -0.46833059, 2.87956824, ..., -0.29321703,
0.35584879, 0.11548285]])
On va remettre countries_transformed dans un geodataframe pour en permettre la visualisation cartographique.
# Wrap the PCA scores in a GeoDataFrame that reuses the index, geometry and
# CRS of `countries`; the score columns are labelled 0, 1, 2, ...
countries_transformed_gdf = gpd.GeoDataFrame(countries_transformed, index=countries.index, geometry=countries.geometry, crs=countries.crs)
countries_transformed_gdf
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | geometry | |
|---|---|---|---|---|---|---|---|---|
| name | ||||||||
| Indonesia | -0.387821 | 2.257714 | -0.270334 | -0.008092 | -0.107584 | -0.430549 | 0.015814 | MULTIPOLYGON (((117.70361 4.16342, 117.70361 4... |
| Malaysia | 1.200680 | 0.552822 | 0.166332 | -0.592792 | -0.205589 | -0.201097 | 0.005038 | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... |
| Chile | 1.679812 | 0.339238 | -0.676616 | -0.487564 | 0.110038 | 0.212709 | 0.027967 | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... |
| Bolivia | -0.473525 | -0.034564 | -0.311355 | -0.194320 | 0.045559 | -0.577952 | 0.034945 | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... |
| Peru | 0.613848 | 0.518250 | -0.737999 | -0.537023 | -0.034877 | 0.158002 | 0.035943 | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Nauru | -0.535845 | -2.413022 | 0.479206 | 2.095558 | -0.443317 | -1.015918 | 0.172378 | POLYGON ((166.93881 -0.49041, 166.95558 -0.497... |
| Micronesia (country) | -1.136291 | -1.647075 | -0.119571 | 1.298190 | -0.484725 | -0.144627 | -0.092071 | MULTIPOLYGON (((163.02605 5.34089, 163.03045 5... |
| Vanuatu | -0.821812 | -1.254173 | -0.446192 | 0.941192 | -0.786332 | 0.186002 | -0.021087 | MULTIPOLYGON (((169.84034 -20.1408, 169.86052 ... |
| Palau | 0.828657 | -2.153348 | 1.488353 | 1.650089 | 0.284587 | -0.276058 | -0.147992 | MULTIPOLYGON (((134.2715 7.07453, 134.27931 7.... |
| Bahrain | 2.976592 | -0.468331 | 2.879568 | -0.114218 | -0.293217 | 0.355849 | 0.115483 | MULTIPOLYGON (((50.55161 26.19424, 50.59474 26... |
186 rows × 8 columns
Taux de variance expliquée cumulé
plt.figure(figsize=(10,10))
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1..n (not the default 0..n-1) so that the x value is the number
# of components kept and the curve can be read directly.
n_components_kept = np.arange(1, len(cumulative_variance) + 1)
plt.xlabel("Nombre de composantes principales")
plt.ylabel("Part de variance expliquée cumulée")
plt.plot(n_components_kept, cumulative_variance)
[<matplotlib.lines.Line2D at 0x7fc1904c80e0>]
Là par exemple, la première composante principale explique environ 60% de la variance, et la somme des 3 premières environ 90%. Si on garde 3 composantes, on garde "90% de l'information" en n'utilisant que 3 variables au lieu de 7.
pca.components_[0]
array([ 0.46899691, -0.0141802 , -0.06995472, -0.45681452, 0.31120297,
0.48464288, 0.484295 ])
La première composante oppose, d'un côté, l'espérance de vie, les émissions de CO2 et l'HDI aux deux dates et, de l'autre, la mortalité infantile.
pca.components_[1]
array([ 0.08856495, 0.70726013, 0.7007067 , -0.00339905, 0.02488523,
0.01782879, -0.00088282])
La deuxième se concentre surtout sur la population et log population
# Map of the scores on the first principal component (column 0).
pc1_ax = countries_transformed_gdf.plot(column=0, legend=True, figsize=(12, 6))
pc1_ax.set_title("Première composante principale")
Text(0.5, 1.0, 'Première composante principale')
On voit que les pays équatoriaux sont associés à une faible valeur de la première CP, c'est-à-dire à une faible espérance de vie, un faible HDI et un taux de mortalité infantile plus élevé.
# Map of the scores on the second principal component (column 1).
pc2_ax = countries_transformed_gdf.plot(column=1, legend=True, figsize=(12, 6))
pc2_ax.set_title("Seconde composante principale")
Text(0.5, 1.0, 'Seconde composante principale')
Avec cette composante principale, des pays comme l'Inde ou la Chine ont des valeurs élevées parce qu'ils ont une population importante.
Valeurs pour la France :
countries_transformed_gdf.loc["France"]
| France | |
|---|---|
| 0 | 2.136315 |
| 1 | 0.983079 |
| 2 | -0.820932 |
| 3 | -0.693335 |
| 4 | 0.250692 |
| 5 | 0.224911 |
| 6 | -0.013748 |
| geometry | MULTIPOLYGON (((-54.111527 2.11427, -54.134908... |
La France a une valeur positive pour la première CP, donc elle a plutôt une bonne espérance de vie, un haut taux d'émissions de CO2... Elle a une valeur au-dessus de 0 pour la 2ème CP, mais pas très haute : elle est un peu plus peuplée que la moyenne (68 millions vs 43 millions d'habitants).
Calculer la corrélation entre une variable et une composante principale
np.corrcoef(countries_cr["life_expectancy"], countries_transformed_gdf[0])
array([[1. , 0.94254968],
[0.94254968, 1. ]])
On peut vérifier que c'est la même valeur que celle obtenue avec la formule suivante :
pca.components_[0][0] * np.sqrt(pca.explained_variance_[0])
np.float64(0.9425496753669508)
Cercle des corrélations
plt.figure(figsize =(10, 10))
ax = plt.subplot()
plt.xlim(-1.3,1.3)
plt.ylim(-1.3,1.3)
# Unit circle: a variable whose arrow reaches it is fully described by PC1/PC2.
ax.add_patch(plt.Circle((0, 0), radius = 1, color='black', fill=False))
# Correlation of every variable with PC1 and PC2: component coefficient times
# the square root of the component's explained variance. Computed once for all
# variables; the result has one row per variable and one column per component.
correlations = pca.components_[:2].T * np.sqrt(pca.explained_variance_[:2])
for variable, (corr_pc1, corr_pc2) in zip(countries_cr.columns, correlations):
    plt.plot([0, corr_pc1], [0, corr_pc2], color='red')
    plt.text(corr_pc1, corr_pc2, variable)
On retrouve les interprétations faites précédemment :
- Les variables HDI 2013 et 2023, l'espérance de vie et les émissions de CO2 sont corrélées entre elles
- Elles ne sont pas corrélées à la population (angle proche de 90°)
- Elles ont une corrélation négative avec la mortalité infantile
Classification non supervisée¶
from scipy.cluster.hierarchy import dendrogram, linkage
# Ward hierarchical clustering on the first 3 principal components.
Z = linkage(countries_transformed_gdf.iloc[:,:3], 'ward')
plt.figure(figsize=(25, 8))
ax=plt.subplot()
dn = dendrogram(Z)
# dn["leaves"] lists the row positions in the left-to-right order of the
# leaves; use it to label each leaf with its country name.
country_names = countries_transformed_gdf.iloc[dn["leaves"]].index
ax.set_xticklabels(country_names, rotation=90)
# Highlight France in red, every other country stays black.
for tick_label, name in zip(ax.get_xticklabels(), country_names):
    tick_label.set_color("red" if name == "France" else "black")
plt.show()
Les pays les moins similaires sont ceux du groupe orange
from sklearn.cluster import AgglomerativeClustering
# Ward agglomerative clustering of the standardized variables into 2 classes.
cah = AgglomerativeClustering(linkage="ward", n_clusters=2)
cah_labels_2 = cah.fit_predict(countries_cr).astype(int)
# Attach the class labels (stored in column 0) to the country geometries.
cah_class_2 = gpd.GeoDataFrame(
    pd.Series(cah_labels_2, index=countries.index),
    geometry=countries.geometry,
)
cah_class_2.plot(column=0, figsize=(12,5), legend=True)
<Axes: >
countries_transformed_gdf.plot(kind="scatter", x=0, y=1, c=cah_class_2[0], figsize=(12,5), cmap="viridis")
<Axes: xlabel='0', ylabel='1'>
from sklearn.cluster import AgglomerativeClustering
# Ward agglomerative clustering of the standardized variables into 5 classes.
cah = AgglomerativeClustering(linkage="ward", n_clusters=5)
cah_labels_5 = cah.fit_predict(countries_cr).astype(int)
# Attach the class labels (stored in column 0) to the country geometries.
cah_class_5 = gpd.GeoDataFrame(
    pd.Series(cah_labels_5, index=countries.index),
    geometry=countries.geometry,
)
cah_class_5.plot(column=0, figsize=(12,5), legend=True)
<Axes: >
countries_transformed_gdf.plot(kind="scatter", x=0, y=1, c=cah_class_5[0], figsize=(12,5), cmap="viridis")
<Axes: xlabel='0', ylabel='1'>
from sklearn.cluster import KMeans
# K-means starts from random centres: fix the seed and the number of
# restarts so that the partition (and the quality score computed from it) is
# the same every time the notebook is run.
k_means = KMeans(n_clusters=2, n_init=10, random_state=0)
k_means_class_2 = k_means.fit_predict(countries_cr).astype(int)
# Attach the class labels (stored in column 0) to the country geometries.
k_means_class_2 = gpd.GeoDataFrame(pd.Series(k_means_class_2, index=countries.index), geometry=countries.geometry)
k_means_class_2.plot(column=0, figsize=(12,5), legend=True)
<Axes: >
from sklearn.cluster import KMeans
# K-means starts from random centres: fix the seed and the number of
# restarts so that the partition (and the quality score computed from it) is
# the same every time the notebook is run.
k_means = KMeans(n_clusters=5, n_init=10, random_state=0)
k_means_class_5 = k_means.fit_predict(countries_cr).astype(int)
# Attach the class labels (stored in column 0) to the country geometries.
k_means_class_5 = gpd.GeoDataFrame(pd.Series(k_means_class_5, index=countries.index), geometry=countries.geometry)
k_means_class_5.plot(column=0, figsize=(12,5), legend=True)
<Axes: >
# Total inertia: sum, over every country and every variable, of the squared
# deviation of the standardized value from its column mean.
inertie_totale = ((countries_cr - countries_cr.mean(axis=0))**2).sum().sum()
inertie_totale
np.float64(1295.0)
# Quality of each partition = between-cluster inertia / total inertia.
# The between-cluster inertia is the sum over clusters of
#     (cluster size) * ||cluster centre - global centre||²
# Without the cluster-size weight the ratio is not comparable with
# `inertie_totale` (which sums over every country) and is far too small.
centre_global = countries_cr.mean(axis=0)
for i, clustering in enumerate([cah_class_2, k_means_class_2, cah_class_5, k_means_class_5]):
    labels = clustering[0]  # class label of every country
    inertie_inter = 0
    for c in range(max(labels)+1):
        membres = countries_cr.loc[labels==c]
        centre_c = membres.mean()
        inertie_inter += len(membres) * ((centre_global - centre_c)**2).sum()
    Q = inertie_inter / inertie_totale
    print("La qualité du clustering", i, "est", Q)
La qualité du clustering 0 est 0.006609839407358844 La qualité du clustering 1 est 0.005332036190339916 La qualité du clustering 2 est 0.093074028932465 La qualité du clustering 3 est 0.09009296871477224
Classification supervisée¶
# Political regime per country, indexed by country name and matched on the index.
regime_table = pd.read_csv("political-regime.csv").set_index("Entity")
countries["political_regime"] = regime_table["Political regime"]
regime_ax = countries.plot(column="political_regime", legend=True, figsize=(12, 6))
regime_ax.set_title("Régime politique")
/usr/local/lib/python3.12/dist-packages/geopandas/geodataframe.py:1968: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy super().__setitem__(key, value)
Text(0.5, 1.0, 'Régime politique')
Quel est le nombre de pays par régime politique ?
countries["political_regime"].value_counts()
| count | |
|---|---|
| political_regime | |
| 2.0 | 53 |
| 1.0 | 53 |
| 3.0 | 33 |
| 0.0 | 28 |
# One box plot per variable, with the countries grouped by political regime.
for variable in ["life_expectancy", "population", "co2_emissions", "child_mortality", "hdi_2013", "hdi_2023"]:
    countries.boxplot(column=variable, by="political_regime", figsize=(12,6))
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
# Countries whose political regime is unknown (NaN) are left out of both the
# training and the test sets: the fitted model can be used afterwards to
# predict a regime for them.
regime_known = countries["political_regime"].notna()
y_known = countries.loc[regime_known, "political_regime"]

X_train, X_test, y_train, y_test = train_test_split(
    countries_cr[regime_known],
    y_known,
    train_size=0.7,
    # Classes are imbalanced: keep the same class proportions in both sets.
    stratify=y_known,
    # Fixed seed so that the split, and therefore the reported scores, can be
    # reproduced from one run to the next.
    random_state=0,
)

# Seed the forest as well: bootstrap samples and feature sub-sampling are
# random, so an unseeded forest gives different predictions on each run.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[2 2 5 0]
[4 7 5 0]
[1 4 7 2]
[0 0 3 9]]
precision recall f1-score support
0.0 0.29 0.22 0.25 9
1.0 0.54 0.44 0.48 16
2.0 0.35 0.50 0.41 14
3.0 0.82 0.75 0.78 12
accuracy 0.49 51
macro avg 0.50 0.48 0.48 51
weighted avg 0.51 0.49 0.49 51
Globalement, le modèle n'est pas très bon puisqu'il se trompe dans plus de la moitié des cas. On peut voir que la classe sur laquelle le modèle s'en sort le mieux est "démocratie libérale", avec un f1-score de 78 %, malgré quelques confusions avec la classe "démocratie électorale". Au contraire, il a du mal à détecter les autocraties fermées, qui sont minoritaires dans le jeu d'entraînement.
La première cause de ces mauvaises performances est que les variables choisies ne permettent pas de séparer suffisamment les classes. Par ailleurs, le nombre de pays est assez faible, ce qui rend l'apprentissage difficile.
# Predicted vs. true regime for every test country, together with its
# geometry so that the outcome can be mapped.
test_index = X_test.index
results = gpd.GeoDataFrame(
    {"y_pred": y_pred, "y_true": y_test},
    index=test_index,
    geometry=countries.loc[test_index, "geometry"],
)
results
| y_pred | y_true | geometry | |
|---|---|---|---|
| name | |||
| Costa Rica | 2.0 | 3.0 | MULTIPOLYGON (((-83.6965 10.93659, -83.68687 1... |
| Belgium | 3.0 | 3.0 | POLYGON ((2.5218 51.08754, 2.542 51.09687, 2.5... |
| Mauritania | 1.0 | 1.0 | MULTIPOLYGON (((-8.68238 27.28542, -8.48441 27... |
| Israel | 3.0 | 3.0 | POLYGON ((34.24835 31.21145, 34.2644 31.22419,... |
| South Korea | 3.0 | 3.0 | MULTIPOLYGON (((128.36492 38.62434, 128.39478 ... |
| Malawi | 1.0 | 2.0 | MULTIPOLYGON (((34.96462 -11.57356, 34.65125 -... |
| Spain | 3.0 | 3.0 | MULTIPOLYGON (((-5.34073 35.84736, -5.3629 35.... |
| Austria | 3.0 | 2.0 | POLYGON ((16.94504 48.60417, 16.95434 48.5574,... |
| Cuba | 2.0 | 0.0 | MULTIPOLYGON (((-75.09501 19.89723, -75.09495 ... |
| Eritrea | 2.0 | 0.0 | MULTIPOLYGON (((43.11769 12.70791, 42.90036 12... |
| Jordan | 2.0 | 0.0 | POLYGON ((35.61176 32.6819, 35.61233 32.68154,... |
| Malta | 3.0 | 2.0 | MULTIPOLYGON (((14.54802 35.89004, 14.56316 35... |
| Uruguay | 2.0 | 3.0 | POLYGON ((-57.60279 -30.19052, -57.58684 -30.2... |
| Latvia | 2.0 | 3.0 | POLYGON ((27.35294 57.5276, 27.52817 57.52848,... |
| Peru | 2.0 | 2.0 | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... |
| Turkey | 2.0 | 1.0 | MULTIPOLYGON (((43.44043 41.10659, 43.43629 41... |
| Finland | 3.0 | 3.0 | MULTIPOLYGON (((28.95408 69.02726, 28.83346 68... |
| Hungary | 2.0 | 1.0 | POLYGON ((22.8776 47.94674, 22.86117 47.93382,... |
| North Macedonia | 2.0 | 2.0 | POLYGON ((20.56715 41.87318, 20.5903 41.85473,... |
| Angola | 2.0 | 1.0 | MULTIPOLYGON (((13.0737 -4.63532, 13.06533 -4.... |
| Laos | 2.0 | 0.0 | POLYGON ((102.11866 22.39755, 102.12542 22.383... |
| Qatar | 0.0 | 0.0 | POLYGON ((50.80787 24.74665, 50.82667 24.74868... |
| Russia | 0.0 | 1.0 | MULTIPOLYGON (((87.81632 49.16584, 87.71638 49... |
| Georgia | 2.0 | 2.0 | POLYGON ((41.5512 42.40646, 41.50017 42.64057,... |
| South Sudan | 0.0 | 0.0 | POLYGON ((35.92084 4.61933, 35.85654 4.6196, 3... |
| Bulgaria | 1.0 | 2.0 | POLYGON ((26.33336 41.71304, 26.29491 41.71032... |
| Mozambique | 0.0 | 1.0 | MULTIPOLYGON (((32.11388 -26.84001, 32.1174 -2... |
| Comoros | 2.0 | 1.0 | MULTIPOLYGON (((43.78875 -12.30804, 43.79705 -... |
| Benin | 0.0 | 1.0 | POLYGON ((3.5964 11.69577, 3.57439 11.67304, 3... |
| El Salvador | 1.0 | 1.0 | MULTIPOLYGON (((-90.09831 13.7314, -90.11431 1... |
| Solomon Islands | 2.0 | 2.0 | MULTIPOLYGON (((160.50367 -11.73602, 160.56446... |
| Mexico | 0.0 | 2.0 | MULTIPOLYGON (((-97.13927 25.96581, -97.16747 ... |
| Myanmar | 1.0 | 0.0 | MULTIPOLYGON (((92.57588 21.97757, 92.58384 21... |
| Syria | 2.0 | 0.0 | POLYGON ((35.75759 32.74435, 35.7842 32.77795,... |
| Panama | 2.0 | 2.0 | MULTIPOLYGON (((-82.5736 9.5762, -82.56188 9.5... |
| Papua New Guinea | 1.0 | 1.0 | MULTIPOLYGON (((140.97446 -2.60052, 140.98732 ... |
| Italy | 3.0 | 3.0 | MULTIPOLYGON (((7.02208 45.92526, 7.06694 45.8... |
| Netherlands | 3.0 | 3.0 | MULTIPOLYGON (((7.19459 53.24502, 7.19747 53.2... |
| Senegal | 1.0 | 2.0 | POLYGON ((-12.26413 14.77494, -12.25651 14.745... |
| Denmark | 3.0 | 3.0 | MULTIPOLYGON (((8.66078 54.89631, 8.66879 54.9... |
| Algeria | 0.0 | 1.0 | POLYGON ((-4.82161 24.99506, -4.99519 25.10209... |
| United Kingdom | 3.0 | 3.0 | MULTIPOLYGON (((-7.2471 55.06932, -7.25674 55.... |
| Jamaica | 2.0 | 2.0 | POLYGON ((-76.26374 18.01236, -76.25678 17.996... |
| Haiti | 1.0 | 0.0 | MULTIPOLYGON (((-71.75744 19.71011, -71.74861 ... |
| Cote d'Ivoire | 1.0 | 1.0 | MULTIPOLYGON (((-7.98966 10.16199, -7.9709 10.... |
| Gabon | 2.0 | 1.0 | MULTIPOLYGON (((13.29457 2.16106, 13.29488 2.1... |
| Iraq | 1.0 | 1.0 | POLYGON ((44.76614 37.14192, 44.75254 37.11314... |
| Togo | 1.0 | 1.0 | POLYGON ((-0.16611 11.13498, -0.11506 11.12466... |
| Poland | 2.0 | 2.0 | POLYGON ((18.8332 49.51026, 18.83743 49.52695,... |
| Guinea-Bissau | 1.0 | 1.0 | MULTIPOLYGON (((-13.72828 12.67339, -13.73631 ... |
| Liberia | 1.0 | 2.0 | POLYGON ((-11.47619 6.91942, -11.44449 6.93394... |
# Two maps of the test countries: predicted regime, then true regime.
ax_pred = results.plot(column="y_pred", legend=True, figsize=(12, 6))
ax_pred.set_title("Régime politique prédit")
ax_true = results.plot(column="y_true", legend=True, figsize=(12, 6))
ax_true.set_title("Régime politique réel")
Text(0.5, 1.0, 'Régime politique réel')
On peut utiliser le modèle (même s'il n'est pas très bon) pour prédire le régime politique des pays pour lesquels on n'a pas de données.
# Apply the fitted classifier to the countries that have no political regime
# in the data, and keep their geometry alongside the predicted class.
regime_unknown = countries["political_regime"].isna()
unlabelled_countries = countries[regime_unknown]
predictions_pays_regimes_inconnus = gpd.GeoDataFrame(
    {"regime prédit": clf.predict(countries_cr[regime_unknown])},
    index=unlabelled_countries.index,
    geometry=unlabelled_countries.geometry,
)
predictions_pays_regimes_inconnus
| regime prédit | geometry | |
|---|---|---|
| name | ||
| Palestine | 1.0 | MULTIPOLYGON (((34.4812 31.58314, 34.52013 31.... |
| Brunei | 2.0 | MULTIPOLYGON (((115.14617 4.90852, 115.1468 4.... |
| Andorra | 3.0 | POLYGON ((1.70701 42.50278, 1.6975 42.49446, 1... |
| Belize | 2.0 | MULTIPOLYGON (((-89.19314 16.39263, -89.18435 ... |
| Bahamas | 3.0 | MULTIPOLYGON (((-72.99957 21.45171, -73.03366 ... |
| Kiribati | 2.0 | MULTIPOLYGON (((173.03826 1.34105, 173.02442 1... |
| Marshall Islands | 1.0 | MULTIPOLYGON (((168.10108 5.59931, 168.10133 5... |
| Grenada | 3.0 | MULTIPOLYGON (((-61.61294 12.21442, -61.60456 ... |
| Saint Vincent and the Grenadines | 3.0 | MULTIPOLYGON (((-61.194 13.03685, -61.20832 13... |
| Saint Lucia | 3.0 | POLYGON ((-60.88679 14.01008, -60.88296 13.980... |
| Dominica | 3.0 | POLYGON ((-61.36286 15.20181, -61.37409 15.204... |
| Antigua and Barbuda | 3.0 | MULTIPOLYGON (((-61.88362 17.04902, -61.87922 ... |
| Saint Kitts and Nevis | 3.0 | MULTIPOLYGON (((-62.59923 17.20295, -62.57844 ... |
| Tonga | 3.0 | MULTIPOLYGON (((-173.95637 -18.56732, -173.941... |
| Samoa | 3.0 | MULTIPOLYGON (((-172.20104 -13.59254, -172.197... |
| Tuvalu | 3.0 | MULTIPOLYGON (((179.19125 -8.54209, 179.20004 ... |
| Nauru | 3.0 | POLYGON ((166.93881 -0.49041, 166.95558 -0.497... |
| Micronesia (country) | 2.0 | MULTIPOLYGON (((163.02605 5.34089, 163.03045 5... |
| Palau | 3.0 | MULTIPOLYGON (((134.2715 7.07453, 134.27931 7.... |
BONUS: probabilités¶
# Class probabilities estimated by the fitted classifier for each test sample:
# one row per sample, one column per class.
# NOTE(review): columns presumably follow the order of clf.classes_ — confirm.
y_proba = clf.predict_proba(X_test)
y_proba
array([[0.12, 0.26, 0.5 , 0.12],
[0.02, 0.05, 0.09, 0.84],
[0.02, 0.65, 0.33, 0. ],
[0.01, 0.02, 0.09, 0.88],
[0. , 0.18, 0.1 , 0.72],
[0.1 , 0.71, 0.19, 0. ],
[0. , 0.08, 0.07, 0.85],
[0.01, 0.02, 0.09, 0.88],
[0.03, 0.29, 0.68, 0. ],
[0.14, 0.23, 0.63, 0. ],
[0.05, 0.34, 0.61, 0. ],
[0.06, 0.03, 0.41, 0.5 ],
[0.01, 0.07, 0.86, 0.06],
[0.01, 0.04, 0.72, 0.23],
[0.21, 0.14, 0.63, 0.02],
[0.31, 0.22, 0.37, 0.1 ],
[0. , 0.1 , 0.03, 0.87],
[0.01, 0.17, 0.74, 0.08],
[0.01, 0.24, 0.67, 0.08],
[0.08, 0.25, 0.67, 0. ],
[0.07, 0.44, 0.47, 0.02],
[0.48, 0.07, 0.23, 0.22],
[0.39, 0.38, 0.18, 0.05],
[0.03, 0.16, 0.71, 0.1 ],
[0.53, 0.3 , 0.17, 0. ],
[0.02, 0.56, 0.39, 0.03],
[0.6 , 0.31, 0.09, 0. ],
[0. , 0.42, 0.44, 0.14],
[0.42, 0.42, 0.16, 0. ],
[0.02, 0.75, 0.23, 0. ],
[0.01, 0.21, 0.56, 0.22],
[0.4 , 0.26, 0.34, 0. ],
[0.12, 0.52, 0.36, 0. ],
[0.04, 0.44, 0.5 , 0.02],
[0.08, 0.34, 0.48, 0.1 ],
[0.03, 0.78, 0.19, 0. ],
[0. , 0.08, 0.05, 0.87],
[0.01, 0. , 0.06, 0.93],
[0.14, 0.59, 0.27, 0. ],
[0. , 0.05, 0.11, 0.84],
[0.37, 0.27, 0.35, 0.01],
[0.06, 0.01, 0.15, 0.78],
[0.01, 0.26, 0.72, 0.01],
[0.04, 0.75, 0.21, 0. ],
[0.23, 0.61, 0.16, 0. ],
[0.01, 0.23, 0.75, 0.01],
[0.26, 0.52, 0.21, 0.01],
[0.03, 0.63, 0.34, 0. ],
[0.12, 0.08, 0.42, 0.38],
[0.02, 0.57, 0.41, 0. ],
[0.06, 0.59, 0.35, 0. ]])
Chaque ligne correspond aux probabilités données par le modèle pour les 4 classes.
# Add one column per class (proba_0 ... proba_3) holding the probability the
# model gave to that class; rows of y_proba are assigned positionally.
proba_columns = [f"proba_{class_id}" for class_id in range(4)]
results[proba_columns] = y_proba
results
| y_pred | y_true | geometry | proba_0 | proba_1 | proba_2 | proba_3 | |
|---|---|---|---|---|---|---|---|
| name | |||||||
| Fiji | 2.0 | 1.0 | MULTIPOLYGON (((-180 -16.16961, -180 -16.14911... | 0.12 | 0.26 | 0.50 | 0.12 |
| Paraguay | 2.0 | 2.0 | POLYGON ((-62.65036 -22.23446, -62.62752 -22.1... | 0.02 | 0.05 | 0.09 | 0.84 |
| Spain | 3.0 | 3.0 | MULTIPOLYGON (((-5.34073 35.84736, -5.3629 35.... | 0.02 | 0.65 | 0.33 | 0.00 |
| Switzerland | 3.0 | 3.0 | POLYGON ((10.45381 46.86443, 10.44854 46.83223... | 0.01 | 0.02 | 0.09 | 0.88 |
| Senegal | 1.0 | 2.0 | POLYGON ((-12.26413 14.77494, -12.25651 14.745... | 0.00 | 0.18 | 0.10 | 0.72 |
| Portugal | 2.0 | 3.0 | MULTIPOLYGON (((-7.10486 38.82719, -7.15101 38... | 0.10 | 0.71 | 0.19 | 0.00 |
| Peru | 2.0 | 2.0 | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... | 0.00 | 0.08 | 0.07 | 0.85 |
| Mali | 0.0 | 0.0 | POLYGON ((-12.26413 14.77494, -12.24679 14.767... | 0.01 | 0.02 | 0.09 | 0.88 |
| Tajikistan | 2.0 | 1.0 | MULTIPOLYGON (((70.5659 41.01838, 70.59345 41.... | 0.03 | 0.29 | 0.68 | 0.00 |
| Nicaragua | 2.0 | 1.0 | MULTIPOLYGON (((-85.70174 11.08088, -85.70242 ... | 0.14 | 0.23 | 0.63 | 0.00 |
| Latvia | 2.0 | 3.0 | POLYGON ((27.35294 57.5276, 27.52817 57.52848,... | 0.05 | 0.34 | 0.61 | 0.00 |
| Germany | 3.0 | 3.0 | MULTIPOLYGON (((13.81572 48.76643, 13.78586 48... | 0.06 | 0.03 | 0.41 | 0.50 |
| Turkmenistan | 1.0 | 0.0 | MULTIPOLYGON (((61.26968 35.6185, 61.24539 35.... | 0.01 | 0.07 | 0.86 | 0.06 |
| Comoros | 1.0 | 1.0 | MULTIPOLYGON (((43.78875 -12.30804, 43.79705 -... | 0.01 | 0.04 | 0.72 | 0.23 |
| Slovenia | 3.0 | 3.0 | POLYGON ((13.64292 45.45943, 13.64282 45.45945... | 0.21 | 0.14 | 0.63 | 0.02 |
| Sweden | 3.0 | 3.0 | MULTIPOLYGON (((20.62316 69.03636, 20.67546 69... | 0.31 | 0.22 | 0.37 | 0.10 |
| France | 3.0 | 3.0 | MULTIPOLYGON (((-54.11153 2.11427, -54.13491 2... | 0.00 | 0.10 | 0.03 | 0.87 |
| Libya | 1.0 | 0.0 | POLYGON ((11.50511 33.18122, 11.52589 33.17695... | 0.01 | 0.17 | 0.74 | 0.08 |
| Saudi Arabia | 2.0 | 0.0 | MULTIPOLYGON (((50.80787 24.74665, 50.88101 24... | 0.01 | 0.24 | 0.67 | 0.08 |
| East Timor | 2.0 | 2.0 | MULTIPOLYGON (((124.45053 -9.18019, 124.4515 -... | 0.08 | 0.25 | 0.67 | 0.00 |
| Philippines | 1.0 | 1.0 | MULTIPOLYGON (((120.86069 5.57437, 120.87306 5... | 0.07 | 0.44 | 0.47 | 0.02 |
| Rwanda | 1.0 | 1.0 | POLYGON ((29.01536 -2.72071, 29.00012 -2.70366... | 0.48 | 0.07 | 0.23 | 0.22 |
| Venezuela | 1.0 | 1.0 | MULTIPOLYGON (((-60.02098 8.55801, -59.9597 8.... | 0.39 | 0.38 | 0.18 | 0.05 |
| Congo | 2.0 | 1.0 | POLYGON ((18.62639 3.47687, 18.63455 3.44922, ... | 0.03 | 0.16 | 0.71 | 0.10 |
| Bahrain | 2.0 | 0.0 | MULTIPOLYGON (((50.55161 26.19424, 50.59474 26... | 0.53 | 0.30 | 0.17 | 0.00 |
| Malaysia | 1.0 | 1.0 | MULTIPOLYGON (((117.70361 4.16342, 117.69711 4... | 0.02 | 0.56 | 0.39 | 0.03 |
| Sudan | 1.0 | 0.0 | MULTIPOLYGON (((22.86106 10.91915, 22.90024 11... | 0.60 | 0.31 | 0.09 | 0.00 |
| Argentina | 2.0 | 2.0 | MULTIPOLYGON (((-67.1939 -22.82222, -67.14269 ... | 0.00 | 0.42 | 0.44 | 0.14 |
| China | 2.0 | 0.0 | MULTIPOLYGON (((78.91769 33.38626, 78.91595 33... | 0.42 | 0.42 | 0.16 | 0.00 |
| Tanzania | 1.0 | 1.0 | MULTIPOLYGON (((32.92086 -9.4079, 32.90546 -9.... | 0.02 | 0.75 | 0.23 | 0.00 |
| Belgium | 3.0 | 3.0 | POLYGON ((2.5218 51.08754, 2.542 51.09687, 2.5... | 0.01 | 0.21 | 0.56 | 0.22 |
| Albania | 2.0 | 2.0 | POLYGON ((20.56715 41.87318, 20.54172 41.86158... | 0.40 | 0.26 | 0.34 | 0.00 |
| Guinea-Bissau | 1.0 | 1.0 | MULTIPOLYGON (((-13.72828 12.67339, -13.73631 ... | 0.12 | 0.52 | 0.36 | 0.00 |
| Zambia | 1.0 | 2.0 | POLYGON ((32.92086 -9.4079, 32.92303 -9.46629,... | 0.04 | 0.44 | 0.50 | 0.02 |
| Moldova | 2.0 | 2.0 | POLYGON ((26.61789 48.25897, 26.61861 48.26718... | 0.08 | 0.34 | 0.48 | 0.10 |
| Jordan | 2.0 | 0.0 | POLYGON ((35.61176 32.6819, 35.61233 32.68154,... | 0.03 | 0.78 | 0.19 | 0.00 |
| New Zealand | 3.0 | 3.0 | MULTIPOLYGON (((166.13697 -50.86435, 166.20525... | 0.00 | 0.08 | 0.05 | 0.87 |
| Morocco | 2.0 | 0.0 | POLYGON ((-8.81704 27.66146, -8.81845 27.6594,... | 0.01 | 0.00 | 0.06 | 0.93 |
| Finland | 3.0 | 3.0 | MULTIPOLYGON (((28.95408 69.02726, 28.83346 68... | 0.14 | 0.59 | 0.27 | 0.00 |
| Iran | 1.0 | 0.0 | MULTIPOLYGON (((44.80699 39.6399, 44.80965 39.... | 0.00 | 0.05 | 0.11 | 0.84 |
| Iceland | 3.0 | 3.0 | MULTIPOLYGON (((-14.56363 66.38451, -14.61075 ... | 0.37 | 0.27 | 0.35 | 0.01 |
| Lesotho | 1.0 | 2.0 | POLYGON ((28.98085 -28.90904, 28.99542 -28.908... | 0.06 | 0.01 | 0.15 | 0.78 |
| Iraq | 1.0 | 1.0 | POLYGON ((44.76614 37.14192, 44.75254 37.11314... | 0.01 | 0.26 | 0.72 | 0.01 |
| Cuba | 2.0 | 0.0 | MULTIPOLYGON (((-75.09501 19.89723, -75.09495 ... | 0.04 | 0.75 | 0.21 | 0.00 |
| Trinidad and Tobago | 2.0 | 2.0 | MULTIPOLYGON (((-61.00227 10.69937, -61.02086 ... | 0.23 | 0.61 | 0.16 | 0.00 |
| Qatar | 2.0 | 0.0 | POLYGON ((50.80787 24.74665, 50.82667 24.74868... | 0.01 | 0.23 | 0.75 | 0.01 |
| Botswana | 2.0 | 2.0 | POLYGON ((25.25978 -17.79411, 25.21937 -17.879... | 0.26 | 0.52 | 0.21 | 0.01 |
| Zimbabwe | 1.0 | 1.0 | POLYGON ((25.25978 -17.79411, 25.2667 -17.8009... | 0.03 | 0.63 | 0.34 | 0.00 |
| El Salvador | 2.0 | 1.0 | MULTIPOLYGON (((-90.09831 13.7314, -90.11431 1... | 0.12 | 0.08 | 0.42 | 0.38 |
| United Arab Emirates | 2.0 | 0.0 | MULTIPOLYGON (((56.27906 25.62745, 56.3003 25.... | 0.02 | 0.57 | 0.41 | 0.00 |
| Netherlands | 3.0 | 3.0 | MULTIPOLYGON (((7.19459 53.24502, 7.19747 53.2... | 0.06 | 0.59 | 0.35 | 0.00 |
# One map per class, coloured by the probability assigned to that class.
for class_id in range(4):
    proba_column = f"proba_{class_id}"
    ax = results.plot(column=proba_column, legend=True, figsize=(12, 6))
    ax.set_title(proba_column)