final
This commit is contained in:
428679
Data/acled.csv
Normal file
428679
Data/acled.csv
Normal file
File diff suppressed because one or more lines are too long
7699
Data/airports.csv
Normal file
7699
Data/airports.csv
Normal file
File diff suppressed because it is too large
Load Diff
147044
Data/cities.csv
Normal file
147044
Data/cities.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
Data/hdi.xlsx
Normal file
BIN
Data/hdi.xlsx
Normal file
Binary file not shown.
67664
Data/routes.csv
Normal file
67664
Data/routes.csv
Normal file
File diff suppressed because it is too large
Load Diff
27914
Data/vdem.csv
Normal file
27914
Data/vdem.csv
Normal file
File diff suppressed because one or more lines are too long
1070
Data/wb.csv
Normal file
1070
Data/wb.csv
Normal file
File diff suppressed because it is too large
Load Diff
362
FINAL_r_reg_final.ipynb
Normal file
362
FINAL_r_reg_final.ipynb
Normal file
@@ -0,0 +1,362 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "d0c8ee28",
|
||||||
|
"metadata": {
|
||||||
|
"vscode": {
|
||||||
|
"languageId": "r"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"corrplot 0.95 loaded\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"library(pscl)\n",
|
||||||
|
"library(MASS)\n",
|
||||||
|
"library(dplyr)\n",
|
||||||
|
"library(car)\n",
|
||||||
|
"library(corrplot)\n",
|
||||||
|
"df <- read.csv(\"clean_data.csv\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "b2731cb4",
|
||||||
|
"metadata": {
|
||||||
|
"vscode": {
|
||||||
|
"languageId": "r"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [,1]\n",
|
||||||
|
"Number_of_Flights_Mean 2.435639e+01\n",
|
||||||
|
"Number_of_Flights_Median 5.000000e+00\n",
|
||||||
|
"Number_of_Flights_SD 6.742273e+01\n",
|
||||||
|
"Number_of_Flights_Min 1.000000e+00\n",
|
||||||
|
"Number_of_Flights_Max 1.226000e+03\n",
|
||||||
|
"Served_Population_Mean 9.937426e+05\n",
|
||||||
|
"Served_Population_Median 2.699935e+05\n",
|
||||||
|
"Served_Population_SD 2.479144e+06\n",
|
||||||
|
"Served_Population_Min 2.300000e+01\n",
|
||||||
|
"Served_Population_Max 3.372453e+07\n",
|
||||||
|
"Number_of_Events_Mean 9.763500e+01\n",
|
||||||
|
"Number_of_Events_Median 1.700000e+01\n",
|
||||||
|
"Number_of_Events_SD 2.590523e+02\n",
|
||||||
|
"Number_of_Events_Min 0.000000e+00\n",
|
||||||
|
"Number_of_Events_Max 6.079000e+03\n",
|
||||||
|
"GDP_per_capita_Mean 2.690592e+04\n",
|
||||||
|
"GDP_per_capita_Median 1.144770e+04\n",
|
||||||
|
"GDP_per_capita_SD 2.487818e+04\n",
|
||||||
|
"GDP_per_capita_Min 2.102365e+02\n",
|
||||||
|
"GDP_per_capita_Max 1.127264e+05\n",
|
||||||
|
"Land_area_Mean 4.148564e+06\n",
|
||||||
|
"Land_area_Median 1.557258e+06\n",
|
||||||
|
"Land_area_SD 4.617248e+06\n",
|
||||||
|
"Land_area_Min 2.000000e+01\n",
|
||||||
|
"Land_area_Max 1.637687e+07\n",
|
||||||
|
"Unemployment_Mean 6.307765e+00\n",
|
||||||
|
"Unemployment_Median 4.560000e+00\n",
|
||||||
|
"Unemployment_SD 4.294981e+00\n",
|
||||||
|
"Unemployment_Min 1.000000e-01\n",
|
||||||
|
"Unemployment_Max 2.846800e+01\n",
|
||||||
|
"Freedom_of_Expression_Mean 6.868388e-01\n",
|
||||||
|
"Freedom_of_Expression_Median 8.080000e-01\n",
|
||||||
|
"Freedom_of_Expression_SD 3.023572e-01\n",
|
||||||
|
"Freedom_of_Expression_Min 1.200000e-02\n",
|
||||||
|
"Freedom_of_Expression_Max 9.850000e-01\n",
|
||||||
|
"Civil_Society_Index_Mean 6.990277e-01\n",
|
||||||
|
"Civil_Society_Index_Median 8.370000e-01\n",
|
||||||
|
"Civil_Society_Index_SD 3.056219e-01\n",
|
||||||
|
"Civil_Society_Index_Min 1.500000e-02\n",
|
||||||
|
"Civil_Society_Index_Max 9.790000e-01\n",
|
||||||
|
"HDI_Mean 8.239568e-01\n",
|
||||||
|
"HDI_Median 8.260000e-01\n",
|
||||||
|
"HDI_SD 1.211331e-01\n",
|
||||||
|
"HDI_Min 3.850000e-01\n",
|
||||||
|
"HDI_Max 9.670000e-01\n",
|
||||||
|
"Life_Expectancy_Mean 7.612774e+01\n",
|
||||||
|
"Life_Expectancy_Median 7.797900e+01\n",
|
||||||
|
"Life_Expectancy_SD 5.569156e+00\n",
|
||||||
|
"Life_Expectancy_Min 1.881800e+01\n",
|
||||||
|
"Life_Expectancy_Max 8.405400e+01\n",
|
||||||
|
"Mean_Schooling_Years_Mean 1.069395e+01\n",
|
||||||
|
"Mean_Schooling_Years_Median 1.118458e+01\n",
|
||||||
|
"Mean_Schooling_Years_SD 2.839204e+00\n",
|
||||||
|
"Mean_Schooling_Years_Min 1.412289e+00\n",
|
||||||
|
"Mean_Schooling_Years_Max 1.429637e+01\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"numeric_vars <- df %>%\n",
|
||||||
|
" select(where(is.numeric))\n",
|
||||||
|
"\n",
|
||||||
|
"summary_stats <- numeric_vars %>%\n",
|
||||||
|
" summarise(across(everything(),\n",
|
||||||
|
" list(Mean = ~mean(., na.rm = TRUE),\n",
|
||||||
|
" Median = ~median(., na.rm = TRUE),\n",
|
||||||
|
" SD = ~sd(., na.rm = TRUE),\n",
|
||||||
|
" Min = ~min(., na.rm = TRUE),\n",
|
||||||
|
" Max = ~max(., na.rm = TRUE)),\n",
|
||||||
|
" .names = \"{.col}_{.fn}\"))\n",
|
||||||
|
"\n",
|
||||||
|
"summary_stats_t <- t(summary_stats)\n",
|
||||||
|
"print(summary_stats_t)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "965aa97f",
|
||||||
|
"metadata": {
|
||||||
|
"vscode": {
|
||||||
|
"languageId": "r"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" Served_Population Number_of_Events GDP_per_capita \n",
|
||||||
|
" 1.424350 1.430105 4.591687 \n",
|
||||||
|
" Land_area Unemployment Freedom_of_Expression \n",
|
||||||
|
" 1.574135 1.270525 14.585125 \n",
|
||||||
|
" Civil_Society_Index HDI Life_Expectancy \n",
|
||||||
|
" 13.931561 30.454321 9.553996 \n",
|
||||||
|
" Mean_Schooling_Years \n",
|
||||||
|
" 11.736238 \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"vif_model <- lm(numeric_vars[[1]] ~ ., data = numeric_vars[, -1])\n",
|
||||||
|
"vif_values <- vif(vif_model)\n",
|
||||||
|
"print(vif_values)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "d97fa5af",
|
||||||
|
"metadata": {
|
||||||
|
"vscode": {
|
||||||
|
"languageId": "r"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" Number_of_Flights Served_Population Number_of_Events\n",
|
||||||
|
"Number_of_Flights 1.000000000 0.671699241 0.367851267\n",
|
||||||
|
"Served_Population 0.671699241 1.000000000 0.506996810\n",
|
||||||
|
"Number_of_Events 0.367851267 0.506996810 1.000000000\n",
|
||||||
|
"GDP_per_capita 0.083266657 -0.107883557 -0.097270848\n",
|
||||||
|
"Land_area 0.007304305 -0.009571527 -0.159943385\n",
|
||||||
|
"Unemployment -0.039639363 -0.026894147 0.077944485\n",
|
||||||
|
"Freedom_of_Expression -0.013162926 -0.154830547 0.021788094\n",
|
||||||
|
"Civil_Society_Index -0.015615442 -0.153716921 0.004262525\n",
|
||||||
|
"HDI 0.102218228 -0.075387890 -0.098668654\n",
|
||||||
|
"Life_Expectancy 0.111275449 -0.035340725 -0.048171784\n",
|
||||||
|
"Mean_Schooling_Years 0.067418917 -0.112865885 -0.111281573\n",
|
||||||
|
" GDP_per_capita Land_area Unemployment\n",
|
||||||
|
"Number_of_Flights 0.08326666 0.007304305 -0.03963936\n",
|
||||||
|
"Served_Population -0.10788356 -0.009571527 -0.02689415\n",
|
||||||
|
"Number_of_Events -0.09727085 -0.159943385 0.07794448\n",
|
||||||
|
"GDP_per_capita 1.00000000 0.318471442 -0.30005120\n",
|
||||||
|
"Land_area 0.31847144 1.000000000 -0.20297109\n",
|
||||||
|
"Unemployment -0.30005120 -0.202971089 1.00000000\n",
|
||||||
|
"Freedom_of_Expression 0.59430211 -0.143566301 -0.01042337\n",
|
||||||
|
"Civil_Society_Index 0.58418574 -0.125211593 -0.01680905\n",
|
||||||
|
"HDI 0.79047286 0.287845223 -0.14071643\n",
|
||||||
|
"Life_Expectancy 0.62585773 0.116634966 -0.09027353\n",
|
||||||
|
"Mean_Schooling_Years 0.79576076 0.337944765 -0.22155575\n",
|
||||||
|
" Freedom_of_Expression Civil_Society_Index HDI\n",
|
||||||
|
"Number_of_Flights -0.01316293 -0.015615442 0.10221823\n",
|
||||||
|
"Served_Population -0.15483055 -0.153716921 -0.07538789\n",
|
||||||
|
"Number_of_Events 0.02178809 0.004262525 -0.09866865\n",
|
||||||
|
"GDP_per_capita 0.59430211 0.584185742 0.79047286\n",
|
||||||
|
"Land_area -0.14356630 -0.125211593 0.28784522\n",
|
||||||
|
"Unemployment -0.01042337 -0.016809049 -0.14071643\n",
|
||||||
|
"Freedom_of_Expression 1.00000000 0.961643751 0.42397498\n",
|
||||||
|
"Civil_Society_Index 0.96164375 1.000000000 0.38741832\n",
|
||||||
|
"HDI 0.42397498 0.387418317 1.00000000\n",
|
||||||
|
"Life_Expectancy 0.29538471 0.253887855 0.88027648\n",
|
||||||
|
"Mean_Schooling_Years 0.47456356 0.445093459 0.90374489\n",
|
||||||
|
" Life_Expectancy Mean_Schooling_Years\n",
|
||||||
|
"Number_of_Flights 0.11127545 0.06741892\n",
|
||||||
|
"Served_Population -0.03534072 -0.11286588\n",
|
||||||
|
"Number_of_Events -0.04817178 -0.11128157\n",
|
||||||
|
"GDP_per_capita 0.62585773 0.79576076\n",
|
||||||
|
"Land_area 0.11663497 0.33794476\n",
|
||||||
|
"Unemployment -0.09027353 -0.22155575\n",
|
||||||
|
"Freedom_of_Expression 0.29538471 0.47456356\n",
|
||||||
|
"Civil_Society_Index 0.25388786 0.44509346\n",
|
||||||
|
"HDI 0.88027648 0.90374489\n",
|
||||||
|
"Life_Expectancy 1.00000000 0.65439475\n",
|
||||||
|
"Mean_Schooling_Years 0.65439475 1.00000000\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"numeric_complete <- na.omit(numeric_vars)\n",
|
||||||
|
"cor_matrix <- cor(numeric_complete)\n",
|
||||||
|
"print(cor_matrix)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"id": "8d22a613",
|
||||||
|
"metadata": {
|
||||||
|
"vscode": {
|
||||||
|
"languageId": "r"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"\n",
|
||||||
|
"Call:\n",
|
||||||
|
"glm.nb(formula = Number_of_Events ~ ., data = df_model, control = glm.control(maxit = 100), \n",
|
||||||
|
" init.theta = 0.4173593549, link = log)\n",
|
||||||
|
"\n",
|
||||||
|
"Coefficients:\n",
|
||||||
|
" Estimate Std. Error z value Pr(>|z|) \n",
|
||||||
|
"(Intercept) 5.271e+00 2.296e-01 22.952 < 2e-16 ***\n",
|
||||||
|
"Number_of_Flights 4.101e-03 6.156e-04 6.661 2.72e-11 ***\n",
|
||||||
|
"HDI -2.751e+00 3.066e-01 -8.973 < 2e-16 ***\n",
|
||||||
|
"Served_Population 3.587e-07 1.679e-08 21.357 < 2e-16 ***\n",
|
||||||
|
"Land_area -1.192e-07 7.361e-09 -16.190 < 2e-16 ***\n",
|
||||||
|
"Unemployment 4.779e-02 7.225e-03 6.616 3.70e-11 ***\n",
|
||||||
|
"Freedom_of_Expression 1.136e+00 1.185e-01 9.592 < 2e-16 ***\n",
|
||||||
|
"---\n",
|
||||||
|
"Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
|
||||||
|
"\n",
|
||||||
|
"(Dispersion parameter for Negative Binomial(0.4174) family taken to be 1)\n",
|
||||||
|
"\n",
|
||||||
|
" Null deviance: 4369.5 on 2619 degrees of freedom\n",
|
||||||
|
"Residual deviance: 3164.7 on 2613 degrees of freedom\n",
|
||||||
|
"AIC: 25106\n",
|
||||||
|
"\n",
|
||||||
|
"Number of Fisher Scoring iterations: 1\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" Theta: 0.4174 \n",
|
||||||
|
" Std. Err.: 0.0107 \n",
|
||||||
|
"\n",
|
||||||
|
" 2 x log-likelihood: -25090.2560 "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df_model <- na.omit(df[, c(\"Number_of_Events\", \"Number_of_Flights\", \"HDI\",\n",
|
||||||
|
" \"Served_Population\", \"Land_area\", \n",
|
||||||
|
" \"Unemployment\", \"Freedom_of_Expression\")])\n",
|
||||||
|
"\n",
|
||||||
|
"nb_model <- glm.nb(Number_of_Events ~ ., data = df_model, control = glm.control(maxit = 100))\n",
|
||||||
|
"\n",
|
||||||
|
"summary(nb_model)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"id": "32cdb5b8",
|
||||||
|
"metadata": {
|
||||||
|
"vscode": {
|
||||||
|
"languageId": "r"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"fitting null model for pseudo-r2\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<style>\n",
|
||||||
|
".dl-inline {width: auto; margin:0; padding: 0}\n",
|
||||||
|
".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
|
||||||
|
".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
|
||||||
|
".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
|
||||||
|
"</style><dl class=dl-inline><dt>llh</dt><dd>-12545.1280487307</dd><dt>llhNull</dt><dd>-13056.4659219827</dd><dt>G2</dt><dd>1022.67574650416</dd><dt>McFadden</dt><dd>0.0391635743016152</dd><dt>r2ML</dt><dd>0.323169396916811</dd><dt>r2CU</dt><dd>0.323184565266037</dd></dl>\n"
|
||||||
|
],
|
||||||
|
"text/latex": [
|
||||||
|
"\\begin{description*}\n",
|
||||||
|
"\\item[llh] -12545.1280487307\n",
|
||||||
|
"\\item[llhNull] -13056.4659219827\n",
|
||||||
|
"\\item[G2] 1022.67574650416\n",
|
||||||
|
"\\item[McFadden] 0.0391635743016152\n",
|
||||||
|
"\\item[r2ML] 0.323169396916811\n",
|
||||||
|
"\\item[r2CU] 0.323184565266037\n",
|
||||||
|
"\\end{description*}\n"
|
||||||
|
],
|
||||||
|
"text/markdown": [
|
||||||
|
"llh\n",
|
||||||
|
": -12545.1280487307llhNull\n",
|
||||||
|
": -13056.4659219827G2\n",
|
||||||
|
": 1022.67574650416McFadden\n",
|
||||||
|
": 0.0391635743016152r2ML\n",
|
||||||
|
": 0.323169396916811r2CU\n",
|
||||||
|
": 0.323184565266037\n",
|
||||||
|
"\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" llh llhNull G2 McFadden r2ML \n",
|
||||||
|
"-1.254513e+04 -1.305647e+04 1.022676e+03 3.916357e-02 3.231694e-01 \n",
|
||||||
|
" r2CU \n",
|
||||||
|
" 3.231846e-01 "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"pR2(nb_model)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "R",
|
||||||
|
"language": "R",
|
||||||
|
"name": "ir"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": "r",
|
||||||
|
"file_extension": ".r",
|
||||||
|
"mimetype": "text/x-r-source",
|
||||||
|
"name": "R",
|
||||||
|
"pygments_lexer": "r",
|
||||||
|
"version": "4.5.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
2675
clean_data.csv
Normal file
2675
clean_data.csv
Normal file
File diff suppressed because it is too large
Load Diff
4688
clean_data.ipynb
Normal file
4688
clean_data.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
BIN
final_paper.odt
Normal file
BIN
final_paper.odt
Normal file
Binary file not shown.
BIN
final_paper.pdf
Normal file
BIN
final_paper.pdf
Normal file
Binary file not shown.
134
haversine.py
Normal file
134
haversine.py
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
def assign_events_to_cities(
    city_df, event_df, distance_threshold_km, device,
    city_lat='Latitude', city_lon='Longitude', city_country='Country',
    event_lat='Latitude', event_lon='Longitude', event_country='Country',
    batch_size=10000
):
    """
    Count, for every city, the events whose nearest same-country city it is.

    Each event is matched to the closest city (great-circle / haversine
    distance) that carries the same country label. The match only counts when
    that distance is at most ``distance_threshold_km``. Events with a missing
    country label, or with no same-country city in range, are not counted.

    Args:
        city_df: DataFrame of cities (one row per city).
        event_df: DataFrame of events (one row per event).
        distance_threshold_km: Maximum event-to-city distance, in kilometres.
        device: torch device (or device string) used for the distance math.
        city_lat, city_lon, city_country: Column names in ``city_df``;
            latitude/longitude are read as degrees.
        event_lat, event_lon, event_country: Column names in ``event_df``.
        batch_size: Number of events processed per distance matrix.

    Returns:
        A copy of ``city_df`` with an added integer column
        ``'Number_of_Events'`` (0 for cities without any assigned event).
    """
    n_cities, n_events = len(city_df), len(event_df)

    result_df = city_df.copy()
    result_df['Number_of_Events'] = 0

    # Nothing to match: torch.cat([]) / torch.min over an empty axis would raise.
    if n_cities == 0 or n_events == 0:
        return result_df

    def to_rad_tensor(df, lat_col, lon_col):
        # (rows, 2) float32 tensor of [lat, lon] converted from degrees to radians.
        return torch.deg2rad(torch.tensor(df[[lat_col, lon_col]].values, dtype=torch.float32, device=device))

    def haversine_distance_matrix(src_rad, dst_rad):
        # src columns are kept 2-D so the result broadcasts to (n_src, n_dst).
        src_lat, src_lon = src_rad[:, 0:1], src_rad[:, 1:2]
        dst_lat, dst_lon = dst_rad[:, 0], dst_rad[:, 1]
        dlat = dst_lat - src_lat
        dlon = dst_lon - src_lon
        a = torch.sin(dlat / 2) ** 2 + torch.cos(src_lat) * torch.cos(dst_lat) * torch.sin(dlon / 2) ** 2
        # float32 rounding can push `a` marginally outside [0, 1], which would
        # turn sqrt/arcsin into NaN; clamp before taking the root.
        c = 2 * torch.arcsin(torch.sqrt(torch.clamp(a, 0.0, 1.0)))
        return 6371 * c  # km (mean Earth radius)

    event_coords = to_rad_tensor(event_df, event_lat, event_lon)
    city_coords = to_rad_tensor(city_df, city_lat, city_lon)

    # Encode country labels as shared integer codes so the same-country mask can
    # be built per batch on `device` instead of materialising the full
    # (n_events x n_cities) boolean matrix up front. Missing labels get code -1.
    all_countries = np.concatenate([
        np.asarray(event_df[event_country].values, dtype=object),
        np.asarray(city_df[city_country].values, dtype=object),
    ])
    codes, _ = pd.factorize(all_countries)
    codes = torch.as_tensor(codes, dtype=torch.int64, device=device)
    event_codes, city_codes = codes[:n_events], codes[n_events:]

    min_dists, min_idxs = [], []

    for start in tqdm(range(0, n_events, batch_size), desc="📍 Assigning events to cities"):
        end = min(start + batch_size, n_events)

        dist_matrix = haversine_distance_matrix(event_coords[start:end], city_coords)

        batch_codes = event_codes[start:end, None]
        # A missing country (code -1) never matches anything, including another
        # missing country.
        same_country = (batch_codes == city_codes[None, :]) & (batch_codes >= 0)
        # Cities in other countries can never be the nearest candidate.
        dist_matrix.masked_fill_(~same_country, float('inf'))

        min_dist, min_idx = torch.min(dist_matrix, dim=1)
        min_dists.append(min_dist)
        min_idxs.append(min_idx)

    min_dists = torch.cat(min_dists)
    min_idxs = torch.cat(min_idxs)
    valid_mask = min_dists <= distance_threshold_km

    # Positional city indices of every accepted event; tally them in one pass.
    assigned_idxs = min_idxs[valid_mask].cpu().numpy()
    result_df['Number_of_Events'] = np.bincount(assigned_idxs, minlength=n_cities)

    return result_df
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_served_population(airports_df, cities_df, distance_threshold_km, device, batch_size=2000):
    """
    Assign each city's population to the airport with the highest score:
        score = Departures / (Distance_km + 1)
    Only airports within ``distance_threshold_km`` of the city and carrying the
    same country label are considered. Cities with no eligible airport (or a
    missing country label) contribute to no airport.

    Args:
        airports_df: DataFrame with 'Latitude', 'Longitude' (degrees),
            'Country' and 'Number_of_Flights' columns, one row per airport.
        cities_df: DataFrame with 'Latitude', 'Longitude' (degrees),
            'Country' and 'Population' columns, one row per city.
        distance_threshold_km: Maximum city-to-airport distance, in kilometres.
        device: torch device (or device string) used for the distance math.
        batch_size: Number of cities processed per distance matrix.

    Returns:
        1-D float64 numpy array, positionally aligned with ``airports_df``,
        holding the total population assigned to each airport.
    """
    n_airports, n_cities = len(airports_df), len(cities_df)

    # Totals are accumulated on the host in float64: float32 cannot represent
    # integers above ~16.7 million exactly, which large catchments exceed.
    assigned_pop = np.zeros(n_airports, dtype=np.float64)
    if n_airports == 0 or n_cities == 0:
        return assigned_pop

    def to_rad_tensor(df):
        # (rows, 2) float32 tensor of [lat, lon] converted from degrees to radians.
        return torch.deg2rad(torch.tensor(df[['Latitude', 'Longitude']].values, dtype=torch.float32, device=device))

    airport_coords = to_rad_tensor(airports_df)
    city_coords = to_rad_tensor(cities_df)
    airport_lat, airport_lon = airport_coords[:, 0], airport_coords[:, 1]
    airport_departures = torch.tensor(airports_df['Number_of_Flights'].values, dtype=torch.float32, device=device)
    city_pops = cities_df['Population'].to_numpy(dtype=np.float64)

    # Shared integer country codes let the same-country test run as a tensor
    # comparison on `device`; missing labels get code -1.
    all_countries = np.concatenate([
        np.asarray(cities_df['Country'].values, dtype=object),
        np.asarray(airports_df['Country'].values, dtype=object),
    ])
    codes, _ = pd.factorize(all_countries)
    codes = torch.as_tensor(codes, dtype=torch.int64, device=device)
    city_codes, airport_codes = codes[:n_cities], codes[n_cities:]

    for start in tqdm(range(0, n_cities, batch_size), desc="📊 Assigning using weighted score"):
        end = min(start + batch_size, n_cities)

        # Haversine distances, shape (cities in batch, airports).
        city_lat = city_coords[start:end, 0:1]
        city_lon = city_coords[start:end, 1:2]
        dlat = airport_lat - city_lat
        dlon = airport_lon - city_lon
        a = torch.sin(dlat / 2) ** 2 + torch.cos(city_lat) * torch.cos(airport_lat) * torch.sin(dlon / 2) ** 2
        # Clamp guards against float32 rounding taking `a` outside [0, 1].
        dists = 6371 * 2 * torch.asin(torch.sqrt(torch.clamp(a, 0.0, 1.0)))

        batch_codes = city_codes[start:end, None]
        eligible = (
            (batch_codes == airport_codes[None, :])
            & (batch_codes >= 0)  # a missing country never matches
            & (dists <= distance_threshold_km)
        )

        # Ineligible airports get -inf so they can never win the argmax.
        scores = (airport_departures / (dists + 1)).masked_fill(~eligible, float('-inf'))
        best_airport = torch.argmax(scores, dim=1)
        has_airport = eligible.any(dim=1)

        chosen = best_airport[has_airport].cpu().numpy()
        weights = city_pops[start:end][has_airport.cpu().numpy()]
        assigned_pop += np.bincount(chosen, weights=weights, minlength=n_airports)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return assigned_pop
|
127
mappings.py
Normal file
127
mappings.py
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
# Maps the country-name variants found in the individual datasets onto one
# canonical name per country so the datasets can be joined on country.
# Dependent territories are folded into their sovereign state.
# Every key appears exactly once: Python silently keeps only the last value of
# a repeated dict key, so duplicates would hide conflicting edits.
country_name_map = {
    # --- Territories and dependencies -> sovereign state ---
    'Puerto Rico': 'United States',
    'Guam': 'United States',
    'Northern Mariana Islands': 'United States',
    'Netherlands Antilles': 'Netherlands',
    'Aruba': 'Netherlands',
    'Jersey': 'United Kingdom',
    'Guernsey': 'United Kingdom',
    'Gibraltar': 'United Kingdom',
    'Isle of Man': 'United Kingdom',
    'Bermuda': 'United Kingdom',
    'Anguilla': 'United Kingdom',
    # Greenland is part of the Kingdom of Denmark (like the Faroe Islands).
    'Greenland': 'Denmark',
    'Faroe Islands': 'Denmark',
    'New Caledonia': 'France',
    'Reunion': 'France',
    'Réunion': 'France',
    'Martinique': 'France',
    'Guadeloupe': 'France',
    'French Guiana': 'France',
    'French Polynesia': 'France',
    'Saint Pierre and Miquelon': 'France',
    'Wallis and Futuna': 'France',
    'Mayotte': 'France',
    'Cook Islands': 'New Zealand',
    'Western Sahara': 'Morocco',

    # --- Congo (both states) ---
    'Congo (Kinshasa)': 'Democratic Republic of the Congo',
    'Congo, Democratic Republic of the': 'Democratic Republic of the Congo',
    'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
    'Congo (Democratic Republic of the)': 'Democratic Republic of the Congo',
    'Congo (Brazzaville)': 'Republic of the Congo',
    'Congo, Rep.': 'Republic of the Congo',
    'Congo': 'Republic of the Congo',

    # --- Korea (both states) ---
    'Korea, Rep.': 'South Korea',
    'Korea, Republic of': 'South Korea',
    'Korea (Republic of)': 'South Korea',
    'Republic of Korea': 'South Korea',
    "Korea, Dem. People's Rep.": 'North Korea',
    "Korea (Democratic People's Rep. of)": 'North Korea',

    # --- Hong Kong, Macau, Taiwan ---
    'Hong Kong SAR, China': 'Hong Kong',
    'Hong Kong, China (SAR)': 'Hong Kong',
    'Hong Kong, China': 'Hong Kong',
    'Macao SAR, China': 'Macau',
    'Taiwan, China': 'Taiwan',

    # --- Alternate spellings and official long forms ---
    'United States of America': 'United States',
    'Türkiye': 'Turkey',
    'Turkiye': 'Turkey',
    'Czechia': 'Czech Republic',
    'Slovak Republic': 'Slovakia',
    'Russian Federation': 'Russia',
    'Burma/Myanmar': 'Burma',
    'Myanmar': 'Burma',
    'Ivory Coast': "Cote d'Ivoire",
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Timor-Leste': 'East Timor',
    'North Macedonia': 'Macedonia',
    'Macedonia, The former Yugoslav Rep. of': 'Macedonia',
    'Venezuela, Bolivarian Rep. of': 'Venezuela',
    'Venezuela, RB': 'Venezuela',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Iran, Islamic Rep.': 'Iran',
    'Iran, Islamic Rep. of': 'Iran',
    'Iran (Islamic Republic of)': 'Iran',
    'Egypt, Arab Rep.': 'Egypt',
    'Yemen, Rep.': 'Yemen',
    'Syrian Arab Republic': 'Syria',
    'Viet Nam': 'Vietnam',
    'Lao PDR': 'Laos',
    "Lao People's Democratic Republic": 'Laos',
    'Brunei Darussalam': 'Brunei',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'South Sudan, The Republic of': 'South Sudan',
    'Sudan, The Republic of': 'Sudan',
    'Tanzania, United Republic of': 'Tanzania',
    'Tanzania (United Republic of)': 'Tanzania',
    'Republic of Moldova': 'Moldova',
    'Moldova (Republic of)': 'Moldova',
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Bahamas, The': 'Bahamas',
    'Gambia, The': 'Gambia',
    'Cabo Verde': 'Cape Verde',
}
|
Reference in New Issue
Block a user