我的 df 看起来像这样:
纬度 | 经度 | 身份证 |
---|---|---|
-22.582779 | 29.080456 | 0 |
-22.582575 | 29.080794 | 1 |
41.758910 | -53.626698 | 2 |
17.758527 | -2.443443 | 3 |
-22.582699 | 29.080455 | 4 |
我想创建一个函数,它接受上面的 df ,并为每个记录计算计数和半径 100000 米内的相应 ID。
R = 6371000 #earths' radius
def haversine(lat1, lon1, lat2, lon2):
phi1, phi2 = radians(lat1), radians(lat2)
delta_phi = radians(lat2 - lat1)
delta_lambda = radians(lon2 - lon1)
a = sin(delta_phi / 2)**2 + cos(phi1) * cos(phi2) * sin(delta_lambda / 2)**2
c = 2 * atan2(sqrt(a), sqrt(1 - a))
return R * c
def lat_lon_to_cartesian(lat, lon):
lat, lon = np.radians(lat), np.radians(lon)
x = R * np.cos(lat) * np.cos(lon)
y = R * np.cos(lat) * np.sin(lon)
z = R * np.sin(lat)
return x, y, z
# Add Cartesian coordinates to the DataFrame
df['x'], df['y'], df['z'] = zip(*df.apply(lambda row: lat_lon_to_cartesian(row['lat'], row['lon']), axis=1))
# Build a KD-Tree for fast spatial indexing
tree = cKDTree(df[['x', 'y', 'z']])
# Function to find points within a 100000 meters radius using the KD-Tree
def points_within_radius_kdtree(lat, lon, radius=100000):
x, y, z = lat_lon_to_cartesian(lat, lon)
indices = tree.query_ball_point([x, y, z], radius / R) # Adjust radius for Cartesian space
nearby_points = df.iloc[indices].copy()
nearby_points['distance'] = nearby_points.apply(lambda row: haversine(lat, lon, row['lat'], row['lon']), axis=1)
return nearby_points[nearby_points['distance'] <= radius]
# Function to calculate the count and IDs of records within a 100000 meters radius for each record in the DataFrame
def calculate_nearby_counts(df, radius=100000):
counts = []
ids = []
for idx, row in df.iterrows():
nearby_points = points_within_radius_kdtree(row['lat'], row['lon'], radius)
count = len(nearby_points) - 1 # Subtract 1 to exclude the point itself
nearby_ids = nearby_points[nearby_points['ID'] != row['ID']]['ID'].tolist() # Exclude the point itself
counts.append(count)
ids.append(nearby_ids)
df['nearby_count'] = counts
df['nearby_ids'] = ids
return df
result_df = calculate_nearby_counts(df)
print(result_df)
上面的代码没有给我预期的结果。
我的独立输出是:
纬度 | 经度 | 身份证 | #100000米以内的计数 | 100000米以内的id |
---|---|---|---|---|
-22.582779 | 29.080456 | 0 | 2 | [1,4] |