final

2025-09-26 00:11:14 -05:00
commit 3f6cd4ba59
14 changed files with 688056 additions and 0 deletions
--- a/haversine.py
+++ b/haversine.py
@@ -0,0 +1,134 @@
+import torch
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+def assign_events_to_cities(
+    city_df, event_df, distance_threshold_km, device,
+    city_lat='Latitude', city_lon='Longitude', city_country='Country',
+    event_lat='Latitude', event_lon='Longitude', event_country='Country',
+    batch_size=10000
+):
+    """
+    Assign each event to the nearest city (within distance_threshold_km and same country),
+    and return city_df with a new 'Number_of_Events' column.
+    """
+
+    def to_rad_tensor(df, lat_col, lon_col):
+        return torch.deg2rad(torch.tensor(df[[lat_col, lon_col]].values, dtype=torch.float32, device=device))
+
+    def build_country_mask(src_countries, dst_countries):
+        src = np.array(src_countries)
+        dst = np.array(dst_countries)
+        return torch.tensor((src[:, None] == dst[None, :]), dtype=torch.bool, device=device)
+
+    def haversine_distance_matrix(src_rad, dst_rad):
+        src_lat, src_lon = src_rad[:, 0:1], src_rad[:, 1:2]
+        dst_lat, dst_lon = dst_rad[:, 0], dst_rad[:, 1]
+        dlat = dst_lat - src_lat
+        dlon = dst_lon - src_lon
+        a = torch.sin(dlat / 2) ** 2 + torch.cos(src_lat) * torch.cos(dst_lat) * torch.sin(dlon / 2) ** 2
+        c = 2 * torch.arcsin(torch.sqrt(a))
+        return 6371 * c  # km
+
+    # Prepare coordinate tensors
+    event_coords = to_rad_tensor(event_df, event_lat, event_lon)
+    city_coords = to_rad_tensor(city_df, city_lat, city_lon)
+    country_mask = build_country_mask(event_df[event_country].values, city_df[city_country].values)
+
+    # Assignment arrays
+    min_dists, min_idxs = [], []
+
+    for start in tqdm(range(0, len(event_df), batch_size), desc="📍 Assigning events to cities"):
+        end = min(start + batch_size, len(event_df))
+        batch_coords = event_coords[start:end]
+        mask_batch = country_mask[start:end]
+
+        dist_matrix = haversine_distance_matrix(batch_coords, city_coords)
+        dist_matrix[~mask_batch] = 1e6  # mask out different countries
+
+        min_dist, min_idx = torch.min(dist_matrix, dim=1)
+        min_dists.append(min_dist)
+        min_idxs.append(min_idx)
+
+    min_dists = torch.cat(min_dists)
+    min_idxs = torch.cat(min_idxs)
+    valid_mask = min_dists <= distance_threshold_km
+
+    # Build result DataFrame
+    assigned_idxs = min_idxs[valid_mask].cpu().numpy()
+    result_df = city_df.copy()
+    result_df['Number_of_Events'] = 0
+
+    # Safe event assignment using .iloc to align correctly
+    counts = pd.Series(assigned_idxs).value_counts()
+    for idx, count in counts.items():
+        result_df.iloc[idx, result_df.columns.get_loc('Number_of_Events')] = count
+
+    return result_df
+
+
+def calculate_served_population(airports_df, cities_df, distance_threshold_km, device, batch_size=2000):
+    """
+    Assign each city's population to the airport with the highest score:
+        score = Departures / (Distance_km + 1)
+    Only airports within 50 km and same country are considered.
+    """
+
+    # Tensors
+    airport_coords = torch.tensor(airports_df[['Latitude', 'Longitude']].values, dtype=torch.float32).to(device)
+    city_coords = torch.tensor(cities_df[['Latitude', 'Longitude']].values, dtype=torch.float32).to(device)
+    city_pops = torch.tensor(cities_df['Population'].values, dtype=torch.float32).to(device)
+    airport_departures = torch.tensor(airports_df['Number_of_Flights'].values, dtype=torch.float32).to(device)
+
+    airport_countries = airports_df['Country'].values
+    city_countries = cities_df['Country'].values
+
+    def haversine_batch(lat1, lon1, lat2, lon2):
+        lat1_rad = torch.deg2rad(lat1)
+        lon1_rad = torch.deg2rad(lon1)
+        lat2_rad = torch.deg2rad(lat2)
+        lon2_rad = torch.deg2rad(lon2)
+        dlon = lon2_rad - lon1_rad
+        dlat = lat2_rad - lat1_rad
+        a = torch.sin(dlat / 2) ** 2 + torch.cos(lat1_rad) * torch.cos(lat2_rad) * torch.sin(dlon / 2) ** 2
+        c = 2 * torch.asin(torch.sqrt(a))
+        return 6371 * c
+
+    assigned_pop = torch.zeros(len(airport_coords), device=device)
+
+    for start in tqdm(range(0, len(city_coords), batch_size), desc="📊 Assigning using weighted score"):
+        end = min(start + batch_size, len(city_coords))
+        batch_coords = city_coords[start:end]
+        batch_pops = city_pops[start:end]
+        batch_countries = city_countries[start:end]
+
+        for i in range(batch_coords.shape[0]):
+            country = batch_countries[i]
+            matching_idx = [j for j, c in enumerate(airport_countries) if c == country]
+
+            if not matching_idx:
+                continue
+
+            subset_coords = airport_coords[matching_idx]
+            subset_deps = airport_departures[matching_idx]
+            lat1 = batch_coords[i, 0].unsqueeze(0)
+            lon1 = batch_coords[i, 1].unsqueeze(0)
+
+            dists = haversine_batch(
+                lat1, lon1,
+                subset_coords[:, 0], subset_coords[:, 1]
+            ).squeeze()
+
+            in_range_mask = dists <= distance_threshold_km
+            if torch.any(in_range_mask):
+                dists_in_range = dists[in_range_mask]
+                deps_in_range = subset_deps[in_range_mask]
+                scores = deps_in_range / (dists_in_range + 1)
+                best_idx = torch.argmax(scores)
+                chosen_airport = torch.tensor(matching_idx)[in_range_mask][best_idx]
+                assigned_pop[chosen_airport] += batch_pops[i]
+
+        torch.cuda.empty_cache()
+
+    return assigned_pop.cpu().numpy()