In [4]:
# Module Imports

import pandas as pd
import numpy as np
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from mappings import country_name_map
from haversine import calculate_served_population, assign_events_to_cities

In [5]:
# Data Imports

vdem = pd.read_csv("./Data/vdem.csv")
hdi = pd.read_excel("./Data/hdi.xlsx")
flights = pd.read_csv("./Data/routes.csv")
airports = pd.read_csv("./Data/airports.csv")
wb = pd.read_csv("./Data/wb.csv")
acled = pd.read_csv("./Data/acled.csv")
cities = pd.read_csv("./Data/cities.csv", sep=";") 


  acled = pd.read_csv("./Data/acled.csv")


First we'll start of by cleaning and organizing the three most important datasets: flights, cities, and acled. 

In [47]:
# First we have to merge the flights and airports datasets to get the number of flights per airport by City.

# Make a copy of the flights dataframe to avoid modifying the original
flights_count = flights.copy()

# Group by Origin airport and count flights
flights_count = (
    flights_count.groupby("Origin")
    .agg(Number_of_Flights=("Origin", "count"))
    .reset_index()
)

# First, merge flights_count with airports to get city information
airports_with_flights = pd.merge(
    flights_count,
    airports,
    left_on='Origin',
    right_on='IATA',
    how='left')
airports_with_flights


Unnamed: 0,Origin,Number_of_Flights,0,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz Database,Type,Source
0,AAE,9,220.0,Rabah Bitat Airport,Annaba,Algeria,AAE,DABB,36.822201,7.809174,16.0,1,N,Africa/Algiers,airport,OurAirports
1,AAL,20,628.0,Aalborg Airport,Aalborg,Denmark,AAL,EKYT,57.092759,9.849243,10.0,1,E,Europe/Copenhagen,airport,OurAirports
2,AAN,2,5937.0,Al Ain International Airport,Al Ain,United Arab Emirates,AAN,OMAL,24.261700,55.609200,869.0,4,U,Asia/Dubai,airport,OurAirports
3,AAQ,3,4353.0,Anapa Vityazevo Airport,Anapa,Russia,AAQ,URKA,45.002102,37.347301,174.0,3,N,Europe/Moscow,airport,OurAirports
4,AAR,8,607.0,Aarhus Airport,Aarhus,Denmark,AAR,EKAH,56.299999,10.619000,82.0,1,E,Europe/Copenhagen,airport,OurAirports
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3404,ZUH,60,6355.0,Zhuhai Jinwan Airport,Zhuhai,China,ZUH,ZGSD,22.006399,113.375999,23.0,8,U,Asia/Shanghai,airport,OurAirports
3405,ZUM,2,5550.0,Churchill Falls Airport,Churchill Falls,Canada,ZUM,CZUM,53.561901,-64.106400,1442.0,-4,A,America/Halifax,airport,OurAirports
3406,ZVK,3,3118.0,Savannakhet Airport,Savannakhet,Laos,ZVK,VLSK,16.556601,104.760002,509.0,7,U,Asia/Vientiane,airport,OurAirports
3407,ZYI,15,9846.0,Zunyi Xinzhou Airport,Zunyi,China,ZYI,ZUZY,27.589500,107.000700,2920.0,8,N,Asia/Shanghai,airport,OurAirports


In [None]:
# Next we do the initial cleanup for our Cities dataset

# Split up the coordinates into latitude and longitude
cities['Latitude'] = cities['Coordinates'].str.split(',', expand=True)[0].astype(float)
cities['Longitude'] = cities['Coordinates'].str.split(',', expand=True)[1].str.strip().astype(float)

# Apply country name mapping to standardize country names

# Create a copy of the country_name_map dictionary for additional mappings
extended_country_map = country_name_map.copy()

# Add specific mappings for problematic countries with commas (had to do this twice for some reason but it's ok)
problematic_countries = {
    'Taiwan, China': 'Taiwan',
    'Tanzania, United Republic of': 'Tanzania',
    'Sudan, The Republic of': 'Sudan',
    "Korea, Dem. People's Rep. of": 'North Korea',
    'Korea, Republic of': 'South Korea',
    'Moldova, Republic of': 'Moldova',
    'Macedonia, The former Yugoslav Rep. of': 'North Macedonia',
    'Iran, Islamic Rep. of': 'Iran',
    'Congo, Democratic Republic of the': 'Democratic Republic of the Congo',
    'Venezuela, Bolivarian Rep. of': 'Venezuela',
    'South Sudan, The Republic of': 'South Sudan',
    'Macau, China': 'Macau'
}




# Update our mapping dictionary with these specific cases
extended_country_map.update(problematic_countries)

# Now apply the extended mapping
cities["Country"] = cities["Country name EN"].replace(extended_country_map)

cities_to_drop = ['New York City', 'London', 'Hong Kong']
cities = cities[~cities['ASCII Name'].isin(cities_to_drop)]

# Create the population dataframe that will be used for matching in later cells
population_locations = cities[['Country', 'ASCII Name', 'Population', 'Latitude', 'Longitude']].copy()

# Display the first few rows
population_locations

Unnamed: 0,Country,ASCII Name,Population,Latitude,Longitude
0,Peru,Huayllati,515,-13.92862,-72.48496
1,Peru,Duraznopampa,249,-6.59306,-77.80806
2,Peru,Kimbiri,4369,-12.61935,-73.78814
3,Peru,Urb. Santo Domingo,5000,-11.87655,-77.03345
4,Peru,Cono Norte,14542,-16.30111,-71.61647
...,...,...,...,...,...
147038,Hong Kong,Chak On Estate,3913,22.34029,114.16454
147039,Hong Kong,Hing Man Estate,5990,22.26682,114.23270
147040,Hong Kong,Baguio Villa,5339,22.26234,114.13364
147041,Hong Kong,Tsui Chuk Garden,10071,22.34639,114.18766


In [49]:
DISTANCE_PARAMETER = 50 #km

served_populations = calculate_served_population(
    airports_df=airports_with_flights,
    cities_df=population_locations,
    distance_threshold_km=DISTANCE_PARAMETER,
    device="cpu",
    batch_size=2000,
)

airports_with_flights['Served_Population'] = served_populations.astype(int)


üìä Assigning using weighted score: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 74/74 [00:37<00:00,  1.97it/s]


In [50]:
# Group by city AND country to count airports and aggregate flight data

flights_by_city = airports_with_flights.groupby(['City', 'Country']).agg(
    Number_of_Flights=('Number_of_Flights', 'sum'),  # Sum all flights from all airports in the city
    Airports=('IATA', 'count'),  # Count number of airports in the city
    Latitude=('Latitude', 'mean'),  # Average latitude of all airports in the city
    Longitude=('Longitude', 'mean'),  # Average longitude of all airports in the city
    Served_Population=('Served_Population', 'sum')
).reset_index()


# Now make sure the dataset conforms to our country name mapping conventions
flights_by_city['Country'] = flights_by_city['Country'].replace(country_name_map)
flights_by_city = flights_by_city[flights_by_city['Served_Population'] != 0]
flights_by_city

Unnamed: 0,City,Country,Number_of_Flights,Airports,Latitude,Longitude,Served_Population
0,Aalborg,Denmark,20,1,57.092759,9.849243,343777
1,Aarhus,Denmark,8,1,56.299999,10.619000,563419
3,Abadan,Iran,6,1,30.371099,48.228298,700786
4,Abakan,Russia,4,1,53.740002,91.385002,358031
5,Abbotsford,Canada,2,1,49.025299,-122.361000,288453
...,...,...,...,...,...,...,...
3176,Zurich,Switzerland,247,1,47.464699,8.549170,3466448
3177,Zweibruecken,Germany,5,1,49.209400,7.400560,276108
3178,√Ñngelholm,Sweden,3,1,56.296101,12.847100,282496
3179,√áorlu,Turkey,1,1,41.138199,27.919100,448217


In [51]:
# And the same for ACLED

acled.loc[acled["admin1"] == "Hong Kong", "country"] = "Hong Kong"        
acled["Country"] = acled["country"].replace(country_name_map)
acled["Latitude"] = acled["latitude"]
acled["Longitude"] = acled["longitude"]
acled_filtered = acled[["Country", "Latitude", "Longitude"]]
acled_filtered

Unnamed: 0,Country,Latitude,Longitude
0,Bolivia,-16.3991,-67.7190
1,Yemen,14.1381,43.9717
2,Yemen,16.6537,43.2992
3,Yemen,17.0190,43.2983
4,Yemen,16.7910,43.3719
...,...,...,...
428673,India,32.8083,74.8958
428674,South Africa,-33.9502,18.5545
428675,France,-17.5343,-149.5654
428676,Israel,31.7690,35.2163


In [53]:
# Constants

DISTANCE_PARAMETER = 50 #km

flights_events = assign_events_to_cities(flights_by_city, acled_filtered, 
                                                DISTANCE_PARAMETER, device="cpu")


üìç Assigning events to cities: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 43/43 [00:04<00:00,  9.65it/s]


In [None]:
wb = wb[wb["2019 [YR2019]"] != ".."]  # Filter out rows with missing 2019 data
wb = wb[wb["Series Name"].isin(["Unemployment, total (% of total labor force) (modeled ILO estimate)",
                                "Population, total",
                                "GDP per capita (current US$)",
                                "Land area (sq. km)"])]  # Filter for required indicators
wb = wb.pivot(index="Country Name", columns="Series Name", values="2019 [YR2019]")  # Pivot the table
wb.reset_index(inplace=True)  # Reset index for cleaner DataFrame

# Clean and prepare World Bank data
# Converting string values to numeric for calculations
wb['GDP per capita (current US$)'] = pd.to_numeric(wb['GDP per capita (current US$)'], errors='coerce')
wb['Land area (sq. km)'] = pd.to_numeric(wb['Land area (sq. km)'], errors='coerce')
wb['Unemployment, total (% of total labor force) (modeled ILO estimate)'] = pd.to_numeric(wb['Unemployment, total (% of total labor force) (modeled ILO estimate)'], errors='coerce')

wb['Country Name'] = wb['Country Name'].replace(country_name_map)

gdp_map = wb.set_index('Country Name')['GDP per capita (current US$)'].to_dict()
land_area_map = wb.set_index('Country Name')['Land area (sq. km)'].to_dict()
unemployment_map = wb.set_index('Country Name')['Unemployment, total (% of total labor force) (modeled ILO estimate)'].to_dict()

flights_events_wb = flights_events.copy()

flights_events_wb['GDP_per_capita'] = flights_events['Country'].map(gdp_map)
flights_events_wb['Land_area'] = flights_events['Country'].map(land_area_map)
flights_events_wb['Unemployment'] = flights_events['Country'].map(unemployment_map)


In [58]:
vdem = vdem[vdem['year'] == 2021]

# Apply the mapping to the Country column
vdem['country_name'] = vdem['country_name'].replace(country_name_map)

vdem_rename_map = {
    'v2x_regime': 'Regime Type',
    'v2x_polyarchy': 'Electoral Democracy Index',
    'v2x_libdem': 'Liberal Democracy Index',
    'v2x_egaldem': 'Egalitarian Democracy Index',
    'v2x_freexp_altinf': 'Freedom of Expression & Alt Info',
    'v2xcl_rol': 'Rule of Law Index',
    'v2xcs_ccsi': 'Core Civil Society Index',
    'v2x_cspart': 'Civil Society Participation',
    'v2x_clphy': 'Freedom from Political Violence',
    'v2xps_party': 'Party System Institutionalization',
    'v2x_execorr': 'Executive Corruption Index',
    'v2x_corr': 'Political Corruption Index',
    'v2x_suffr': 'Suffrage'
}
vdem = vdem.rename(columns=vdem_rename_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vdem['country_name'] = vdem['country_name'].replace(country_name_map)


In [59]:
freedom_expr_map = vdem.set_index('country_name')['Freedom of Expression & Alt Info'].to_dict()
civil_society_map = vdem.set_index('country_name')['Core Civil Society Index'].to_dict()
flights_events_vdem = flights_events_wb.copy()
flights_events_vdem['Freedom_of_Expression'] = flights_events_vdem['Country'].map(freedom_expr_map)
flights_events_vdem['Civil_Society_Index'] = flights_events_vdem['Country'].map(civil_society_map)

In [60]:
hdi = hdi.pivot(index=["country"], columns="indicator", values="value").reset_index()

In [None]:
# Create a dictionary mapping countries to their HDI values
hdi_map = hdi.set_index('country')['Human Development Index (value)'].to_dict()
life_expectancy_map = hdi.set_index('country')['Life Expectancy at Birth (years)'].to_dict()
schooling_map = hdi.set_index('country')['Mean Years of Schooling (years)'].to_dict()

# Replace country names in the maps if needed
for old_name, new_name in country_name_map.items():
    if old_name in hdi_map:
        hdi_map[new_name] = hdi_map.pop(old_name)
    if old_name in life_expectancy_map:
        life_expectancy_map[new_name] = life_expectancy_map.pop(old_name)
    if old_name in schooling_map:
        schooling_map[new_name] = schooling_map.pop(old_name)

flights_events_hdi = flights_events_vdem.copy()

# Add HDI, Life Expectancy, and Mean Years of Schooling to cities dataframe
flights_events_hdi['HDI'] = flights_events_hdi['Country'].map(hdi_map)
flights_events_hdi['Life_Expectancy'] = flights_events_hdi['Country'].map(life_expectancy_map)
flights_events_hdi['Mean_Schooling_Years'] = flights_events_hdi['Country'].map(schooling_map)

In [62]:
clean_data = flights_events_hdi.drop(columns=["Longitude", "Latitude", "Airports"])

taiwan_data = {
    'GDP_per_capita': 33000,
    'Land_area': 36197,
    'Unemployment': 3.4,
    'Freedom_of_Expression': 0.80,
    'Civil_Society_Index': 0.82,
    'HDI': 0.915,
    'Life_Expectancy': 80.5,
    'Mean_Schooling_Years': 12.3
}

for col, val in taiwan_data.items():
    clean_data.loc[clean_data['Country'] == 'Taiwan', col] = val

clean_data

Unnamed: 0,City,Country,Number_of_Flights,Served_Population,Number_of_Events,GDP_per_capita,Land_area,Unemployment,Freedom_of_Expression,Civil_Society_Index,HDI,Life_Expectancy,Mean_Schooling_Years
0,Aalborg,Denmark,20,343777,57,63064.634313,1396.00,,0.985,0.969,0.959,81.291,13.027321
1,Aarhus,Denmark,8,563419,110,63064.634313,1396.00,,0.985,0.969,0.959,81.291,13.027321
3,Abadan,Iran,6,700786,66,3831.278948,1622500.00,10.740,0.166,0.081,0.793,76.799,10.846712
4,Abakan,Russia,4,358031,2,11447.701172,16376870.00,4.513,0.267,0.246,0.826,72.517,12.410000
5,Abbotsford,Canada,2,288453,17,46352.869345,8965590.00,5.690,0.939,0.874,0.935,81.249,13.870000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3176,Zurich,Switzerland,247,3466448,61,84121.931030,39509.63,4.394,0.978,0.954,0.966,83.200,13.949121
3177,Zweibruecken,Germany,5,276108,61,47623.865607,349390.00,3.163,0.955,0.897,0.955,80.580,14.296372
3178,√Ñngelholm,Sweden,3,282496,21,51773.046456,407280.00,6.833,0.969,0.967,0.959,83.046,12.740326
3179,√áorlu,Turkey,1,448217,85,9215.440499,769630.00,13.730,0.189,0.228,0.853,77.591,8.986244
