import numpy as np
import pandas as pd
import torch

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic, so degrade to plain iteration
    # instead of failing at import time when tqdm is not installed.
    def tqdm(iterable, **_kwargs):
        """Return *iterable* unchanged (no-progress-bar stand-in for tqdm)."""
        return iterable


_EARTH_RADIUS_KM = 6371


def _to_rad_tensor(df, lat_col, lon_col, device):
    """Return an (n, 2) float32 tensor of [lat, lon] in radians on *device*."""
    degrees = df[[lat_col, lon_col]].to_numpy(dtype=np.float64)
    return torch.deg2rad(
        torch.tensor(degrees, dtype=torch.float32, device=device)
    )


def _haversine_km(src_rad, dst_rad):
    """Return the (n_src, n_dst) matrix of great-circle distances in km.

    Both inputs are (k, 2) tensors of [lat, lon] in radians.
    """
    src_lat, src_lon = src_rad[:, 0:1], src_rad[:, 1:2]  # column vectors
    dst_lat, dst_lon = dst_rad[:, 0], dst_rad[:, 1]      # row vectors
    dlat = dst_lat - src_lat
    dlon = dst_lon - src_lon
    a = (
        torch.sin(dlat / 2) ** 2
        + torch.cos(src_lat) * torch.cos(dst_lat) * torch.sin(dlon / 2) ** 2
    )
    # Rounding can push `a` marginally outside [0, 1] (near-antipodal points),
    # which would turn sqrt/arcsin into NaN; clamp keeps genuine NaNs as NaN.
    return _EARTH_RADIUS_KM * 2 * torch.arcsin(torch.sqrt(a.clamp(0.0, 1.0)))


def _same_country_mask(src_countries, dst_countries, device):
    """Return an (n_src, n_dst) bool tensor, True where country labels match."""
    same = src_countries[:, None] == dst_countries[None, :]
    return torch.as_tensor(same, dtype=torch.bool, device=device)


def assign_events_to_cities(
    city_df: pd.DataFrame,
    event_df: pd.DataFrame,
    distance_threshold_km: float,
    device,
    city_lat='Latitude',
    city_lon='Longitude',
    city_country='Country',
    event_lat='Latitude',
    event_lon='Longitude',
    event_country='Country',
    batch_size: int = 10000,
) -> pd.DataFrame:
    """Count, for every city, the events whose nearest same-country city it is.

    Each event is matched to the closest city (haversine distance) that has
    the same country label. The match only counts when that distance is at
    most ``distance_threshold_km``. Events with no same-country city, with
    missing coordinates, or beyond the threshold are left unassigned.

    Args:
        city_df: Cities; must contain the ``city_lat``, ``city_lon`` (degrees)
            and ``city_country`` columns.
        event_df: Events; must contain the ``event_lat``, ``event_lon``
            (degrees) and ``event_country`` columns.
        distance_threshold_km: Maximum event-to-city distance in kilometres.
        device: Torch device (or device string) used for the distance math.
        city_lat, city_lon, city_country: Column names in ``city_df``.
        event_lat, event_lon, event_country: Column names in ``event_df``.
        batch_size: Number of events processed per distance matrix.

    Returns:
        A copy of ``city_df`` with an integer ``'Number_of_Events'`` column,
        aligned positionally with the rows of ``city_df``.
    """
    n_cities, n_events = len(city_df), len(event_df)
    counts = np.zeros(n_cities, dtype=np.int64)

    # With no cities or no events there is nothing to match (and a reduction
    # over an empty dimension would raise), so the counts simply stay zero.
    if n_cities and n_events:
        event_coords = _to_rad_tensor(event_df, event_lat, event_lon, device)
        city_coords = _to_rad_tensor(city_df, city_lat, city_lon, device)
        event_countries = np.asarray(event_df[event_country])
        city_countries = np.asarray(city_df[city_country])

        for start in tqdm(range(0, n_events, batch_size),
                          desc="📍 Assigning events to cities"):
            end = min(start + batch_size, n_events)
            dist = _haversine_km(event_coords[start:end], city_coords)
            # The country mask is built per batch so peak memory is
            # O(batch_size * n_cities) rather than O(n_events * n_cities).
            same_country = _same_country_mask(
                event_countries[start:end], city_countries, device
            )
            # +inf (not a finite sentinel) for excluded pairs, so a large
            # threshold can never match across countries; NaN distances are
            # excluded too so one bad coordinate cannot poison a whole row.
            dist = dist.masked_fill(
                ~same_country | torch.isnan(dist), float('inf')
            )
            nearest_dist, nearest_idx = torch.min(dist, dim=1)
            assigned = (
                (nearest_dist <= distance_threshold_km)
                & torch.isfinite(nearest_dist)
            )
            counts += np.bincount(
                nearest_idx[assigned].cpu().numpy(), minlength=n_cities
            )

    result_df = city_df.copy()
    # A NumPy array is assigned positionally, so the DataFrame index is
    # irrelevant to the alignment.
    result_df['Number_of_Events'] = counts
    return result_df


def calculate_served_population(
    airports_df: pd.DataFrame,
    cities_df: pd.DataFrame,
    distance_threshold_km: float,
    device,
    batch_size: int = 2000,
) -> np.ndarray:
    """Assign each city's population to its best-scoring nearby airport.

    For every city, the candidate airports are those with the same
    ``'Country'`` value whose haversine distance is at most
    ``distance_threshold_km``. Among the candidates the airport maximising

        score = Number_of_Flights / (distance_km + 1)

    receives the city's whole ``'Population'``. Cities without any candidate
    contribute nothing.

    Args:
        airports_df: Airports with ``'Latitude'``, ``'Longitude'`` (degrees),
            ``'Country'`` and ``'Number_of_Flights'`` columns.
        cities_df: Cities with ``'Latitude'``, ``'Longitude'`` (degrees),
            ``'Country'`` and ``'Population'`` columns.
        distance_threshold_km: Maximum city-to-airport distance in kilometres.
        device: Torch device (or device string) used for the distance math.
        batch_size: Number of cities processed per distance matrix.

    Returns:
        A float32 NumPy array with one entry per row of ``airports_df`` (in
        row order) holding the total population assigned to that airport.
    """
    n_airports, n_cities = len(airports_df), len(cities_df)
    # Accumulate in float64: float32 silently drops small increments once a
    # total exceeds 2**24 (~16.7 million people).
    served = np.zeros(n_airports, dtype=np.float64)
    if n_airports == 0 or n_cities == 0:
        return served.astype(np.float32)

    airport_coords = _to_rad_tensor(
        airports_df, 'Latitude', 'Longitude', device
    )
    city_coords = _to_rad_tensor(cities_df, 'Latitude', 'Longitude', device)
    departures = torch.tensor(
        airports_df['Number_of_Flights'].to_numpy(dtype=np.float64),
        dtype=torch.float32,
        device=device,
    )
    populations = cities_df['Population'].to_numpy(dtype=np.float64)
    airport_countries = np.asarray(airports_df['Country'])
    city_countries = np.asarray(cities_df['Country'])

    for start in tqdm(range(0, n_cities, batch_size),
                      desc="📊 Assigning using weighted score"):
        end = min(start + batch_size, n_cities)
        dist = _haversine_km(city_coords[start:end], airport_coords)
        # NaN distances compare False here, so they are never eligible.
        eligible = (
            _same_country_mask(
                city_countries[start:end], airport_countries, device
            )
            & (dist <= distance_threshold_km)
        )
        # -inf on ineligible airports keeps them from winning the argmax.
        scores = (departures / (dist + 1)).masked_fill(
            ~eligible, float('-inf')
        )
        best_airport = torch.argmax(scores, dim=1)
        has_candidate = eligible.any(dim=1)
        served += np.bincount(
            best_airport[has_candidate].cpu().numpy(),
            weights=populations[start:end][has_candidate.cpu().numpy()],
            minlength=n_airports,
        )

    # Cast back so callers keep receiving a float32 array.
    return served.astype(np.float32)