This commit is contained in:
2025-09-26 00:11:14 -05:00
commit 3f6cd4ba59
14 changed files with 688056 additions and 0 deletions

134
haversine.py Normal file
View File

@@ -0,0 +1,134 @@
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
def assign_events_to_cities(
    city_df, event_df, distance_threshold_km, device,
    city_lat='Latitude', city_lon='Longitude', city_country='Country',
    event_lat='Latitude', event_lon='Longitude', event_country='Country',
    batch_size=10000
):
    """
    Count, for every city, the events whose nearest same-country city it is.

    Each event is matched to the closest city (haversine distance, Earth
    radius 6371 km) among the cities whose country value equals the event's.
    The match only counts when that distance is at most
    ``distance_threshold_km``; events with no same-country city, or whose
    nearest one is too far away, are ignored.

    Args:
        city_df: DataFrame of cities; not modified.
        event_df: DataFrame of events.
        distance_threshold_km: Maximum event-to-city distance in kilometres.
        device: Torch device on which distances are computed.
        city_lat, city_lon, city_country: Column names in ``city_df``
            (coordinates in degrees).
        event_lat, event_lon, event_country: Column names in ``event_df``
            (coordinates in degrees).
        batch_size: Number of events processed per distance matrix; bounds
            peak memory at roughly ``batch_size * len(city_df)`` elements.

    Returns:
        A copy of ``city_df`` with an integer ``Number_of_Events`` column,
        filled positionally (row order, not index labels).
    """
    earth_radius_km = 6371

    def to_rad_tensor(df, lat_col, lon_col):
        # (n, 2) tensor of [lat, lon] in radians on the target device.
        degrees = torch.tensor(df[[lat_col, lon_col]].values, dtype=torch.float32, device=device)
        return torch.deg2rad(degrees)

    def haversine_distance_matrix(src_rad, dst_rad):
        # Column slices on src keep a trailing axis so that src (n, 1)
        # broadcasts against dst (m,) into an (n, m) matrix.
        src_lat, src_lon = src_rad[:, 0:1], src_rad[:, 1:2]
        dst_lat, dst_lon = dst_rad[:, 0], dst_rad[:, 1]
        dlat = dst_lat - src_lat
        dlon = dst_lon - src_lon
        a = torch.sin(dlat / 2) ** 2 + torch.cos(src_lat) * torch.cos(dst_lat) * torch.sin(dlon / 2) ** 2
        # float32 rounding can push `a` marginally outside [0, 1] (e.g. for
        # near-antipodal points), which would turn arcsin into NaN.
        a = a.clamp(0.0, 1.0)
        return earth_radius_km * 2 * torch.arcsin(torch.sqrt(a))

    result_df = city_df.copy()
    result_df['Number_of_Events'] = 0

    # Nothing to assign: avoids torch.cat([]) / torch.min over an empty
    # dimension, both of which raise.
    if len(event_df) == 0 or len(city_df) == 0:
        return result_df

    event_coords = to_rad_tensor(event_df, event_lat, event_lon)
    city_coords = to_rad_tensor(city_df, city_lat, city_lon)
    event_countries = np.asarray(event_df[event_country])
    city_countries = np.asarray(city_df[city_country])

    min_dists, min_idxs = [], []
    for start in tqdm(range(0, len(event_df), batch_size), desc="📍 Assigning events to cities"):
        end = min(start + batch_size, len(event_df))
        dist_matrix = haversine_distance_matrix(event_coords[start:end], city_coords)

        # Build the country mask per batch so memory stays bounded by
        # batch_size instead of materialising the full event x city matrix.
        same_country = torch.tensor(
            event_countries[start:end, None] == city_countries[None, :],
            dtype=torch.bool, device=device,
        )
        # Cities in another country can never be the nearest one.
        dist_matrix.masked_fill_(~same_country, float('inf'))

        min_dist, min_idx = torch.min(dist_matrix, dim=1)
        min_dists.append(min_dist)
        min_idxs.append(min_idx)

    min_dists = torch.cat(min_dists)
    min_idxs = torch.cat(min_idxs)

    # isfinite() rejects events that had no same-country city at all, even
    # when the caller passes an infinite threshold.
    valid_mask = torch.isfinite(min_dists) & (min_dists <= distance_threshold_km)
    assigned_idxs = min_idxs[valid_mask].cpu().numpy()

    # Positional tally: entry i is the number of events assigned to row i.
    result_df['Number_of_Events'] = np.bincount(assigned_idxs, minlength=len(city_df))
    return result_df
def calculate_served_population(airports_df, cities_df, distance_threshold_km, device, batch_size=2000):
    """
    Attribute each city's population to its best-scoring nearby airport.

    For every city, the candidate airports are those with the same
    ``Country`` value and a haversine distance (Earth radius 6371 km) of at
    most ``distance_threshold_km``. Among the candidates the airport with
    the highest

        score = Number_of_Flights / (distance_km + 1)

    receives the city's whole ``Population`` (ties go to the airport that
    appears first in ``airports_df``). Cities without any candidate
    contribute nothing.

    Args:
        airports_df: DataFrame with ``Latitude``, ``Longitude`` (degrees),
            ``Country`` and ``Number_of_Flights`` columns.
        cities_df: DataFrame with ``Latitude``, ``Longitude`` (degrees),
            ``Country`` and ``Population`` columns.
        distance_threshold_km: Maximum city-to-airport distance in km.
        device: Torch device on which the computation runs.
        batch_size: Number of cities per distance matrix; bounds peak memory
            at roughly ``batch_size * len(airports_df)`` elements.

    Returns:
        float32 NumPy array of length ``len(airports_df)`` holding the total
        population assigned to each airport, in row order.
    """
    earth_radius_km = 6371

    assigned_pop = torch.zeros(len(airports_df), device=device)
    # With no airports or no cities there is nothing to assign; returning
    # here also avoids argmax over an empty dimension.
    if len(airports_df) == 0 or len(cities_df) == 0:
        return assigned_pop.cpu().numpy()

    def to_device_tensor(values):
        return torch.tensor(values, dtype=torch.float32).to(device)

    airport_rad = torch.deg2rad(to_device_tensor(airports_df[['Latitude', 'Longitude']].values))
    city_rad = torch.deg2rad(to_device_tensor(cities_df[['Latitude', 'Longitude']].values))
    city_pops = to_device_tensor(cities_df['Population'].values)
    airport_departures = to_device_tensor(airports_df['Number_of_Flights'].values)
    airport_countries = np.asarray(airports_df['Country'])
    city_countries = np.asarray(cities_df['Country'])

    # Loop-invariant airport terms, hoisted out of the batch loop.
    airport_lat, airport_lon = airport_rad[:, 0], airport_rad[:, 1]
    cos_airport_lat = torch.cos(airport_lat)

    for start in tqdm(range(0, len(city_rad), batch_size), desc="📊 Assigning using weighted score"):
        end = min(start + batch_size, len(city_rad))

        # Keep a trailing axis on the city side so (b, 1) broadcasts against
        # the airports (a,) into a (b, a) matrix.
        city_lat = city_rad[start:end, 0:1]
        city_lon = city_rad[start:end, 1:2]
        dlat = airport_lat - city_lat
        dlon = airport_lon - city_lon
        a = torch.sin(dlat / 2) ** 2 + torch.cos(city_lat) * cos_airport_lat * torch.sin(dlon / 2) ** 2
        # Clamp guards against float32 rounding taking `a` outside [0, 1],
        # which would make asin return NaN.
        dists = earth_radius_km * 2 * torch.asin(torch.sqrt(a.clamp(0.0, 1.0)))

        # Mask is created directly on `device`, so every index/mask used
        # below lives on the same device as the tensor it indexes.
        same_country = torch.tensor(
            city_countries[start:end, None] == airport_countries[None, :],
            dtype=torch.bool, device=device,
        )
        eligible = same_country & (dists <= distance_threshold_km)

        # Ineligible airports get -inf so they can never win the argmax.
        scores = (airport_departures / (dists + 1)).masked_fill(~eligible, float('-inf'))
        best_airport = torch.argmax(scores, dim=1)

        # Only cities with at least one eligible airport are assigned.
        served = eligible.any(dim=1)
        assigned_pop.index_add_(0, best_airport[served], city_pops[start:end][served])

    # Release cached GPU blocks held by the temporaries above (no-op when
    # CUDA has not been initialised).
    torch.cuda.empty_cache()
    return assigned_pop.cpu().numpy()