我目前正在处理一个数据整理问题:除了一系列其他筛选条件之外,我只应保留那些至少有 3 行数据的参与者的数据行。
我使用的方法(第 9-16 行的计数逻辑和第 37-40 行的写入逻辑)似乎无法把至少有 3 行数据的参与者的最前面几行数据写入输出。这里肯定有逻辑错误,有人能指出我的思路哪里出了问题吗?非常感谢!
import csv
def preprocessing(raw_filename, output_filename):
    """Filter rows of a raw CSV file into an output CSV file in a single pass.

    The header row of the input is written to the output first. Each input
    row is then written when it passes every per-row check below AND the
    participant's running row count has already reached 3 at that moment.

    NOTE(review): the indentation of this snippet was reconstructed; the
    nesting shown here is the only reading under which every name is
    defined where it is used, but confirm against the original.

    Args:
        raw_filename: Path of the CSV file to read (needs a header row).
        output_filename: Path of the CSV file to append to.
    """
    data=open(raw_filename)
    # Append mode: an existing output file is extended, not replaced.
    new=open(output_filename, "a")
    health_write=csv.writer(new)
    reader=csv.DictReader(data)
    health_write.writerow(reader.fieldnames)
    # Running number of rows seen so far for each participant_ID.
    counting_part={}
    for row in reader:
        # Becomes True only if this row passes every check below.
        checker = False
        participant_id=row['participant_ID']
        # Every row is counted, whether or not it passes the checks.
        if participant_id in counting_part:
            counting_part[participant_id]+=1
        else:
            counting_part[participant_id]=1
        # Date part of the timestamp (text before the first whitespace).
        x=row['timestamp'].split()[0]
        # NOTE(review): ('2023') is a plain string, not a tuple, so this is
        # a substring test; a year field such as '23' also passes.
        if x.split('/')[-1] in ('2023'):
            if float(row['accuracy']) <= 30:
                if len(participant_id.split('-'))==5:
                    # Lengths of the five dash-separated ID segments.
                    match=[]
                    for each in participant_id.split('-'):
                        match.append(len(each))
                    if match == [8, 4, 4, 4, 12]:
                        # NOTE(review): `each` still holds the LAST segment
                        # from the loop above, so only that segment is
                        # inspected, and one lowercase or digit character
                        # is enough to reach the remaining checks.
                        for elm in each:
                            if elm.islower() or elm.isdigit():
                                value_lat=float(row['double_latitude'])
                                value_long=float(row['double_longitude'])
                                if (value_lat >= -180 and value_lat
                                <= 180 and value_long >= -180
                                and value_long <=180):
                                    if 0<=float(row
                                    ['double_altitude'])<=1000:
                                        if row['provider'] in ['gps',
                                        'network']:
                                            checker = True
        # NOTE(review): the count is tested while it is still growing, so a
        # participant's first two rows can never be written, even if the
        # participant ends up with 3 or more rows.
        if checker and counting_part[participant_id] >= 3:
            # The loop matches participant_id exactly once, so the row is
            # written a single time.
            for participant_key, count in counting_part.items():
                if participant_id == participant_key:
                    health_write.writerow(row.values())
    data.close()
    new.close()
# Run the filter on the sample input; output is appended to my_output.csv.
preprocessing("example_raw.csv", "my_output.csv")
row_id,timestamp,participant_ID,double_latitude,double_longitude,double_altitude,provider,accuracy
1,24/1/2023 1:05,2c9a7613-84f7-4c6d-af82,-37.80014035,144.963727,171.117,gps,41
2,24/1/2023 3:38,2c9a7613-84f7-4c6d-af82-9f54c8be2c1a,-37.80389128,144.9634838,143.159,gps,12.7445
3,24/1/2022 3:44,2c9a7613-84f7-4c6d-af82-9f54c8be2c1a,-37.80438867,144.9645387,19.95410156,gps,57.0303
4,24/1/2023 4:02,2c9a7613-84f7-4c6d-af82-9f54c8be2c1a,-37.80366035,144.9679344,56.83,gps,6.76879
5,24/1/2023 4:05,2c9a7613-84f7-4c6d-af82-9f54c8be2c1a,-37.80139125,144.9683195,55.13,gps,4.98383
6,15/7/2022 4:08,a7f3e14c-5db5-47e7-86c9-81f60e727d61,-37.80512032,144.9676758,4.723449707,gps,6.10221
7,15/7/2023 7:30,a7f3e14c-5db5-47e7-86c9-81f60e727d61,-37.80480437,144.9675091,95.83,gps,15
由于您在读取行并按参与者计算行的同一循环中写入输出文件,因此在到达参与者的第三行之前,您不会为参与者写入任何内容。
不要只使用计数器,而是为每个参与者保存一个包含其所有行的列表。然后在循环结束后遍历所有参与者,如果某个参与者的行数至少为 3,就写入他的所有行。
每个参与者的字典还可以包含一个 `checker` 变量:只要该参与者有任意一行满足所有条件,就把它设置为 True。
from collections import defaultdict
def _row_is_valid(row):
    """Return True when one CSV row passes every per-row criterion.

    Criteria: the timestamp's year field is exactly '2023'; the participant
    ID has five dash-separated segments of lengths 8-4-4-4-12 made up only
    of lowercase letters and digits; accuracy <= 30; latitude and longitude
    both within [-180, 180]; altitude within [0, 1000]; provider is 'gps' or
    'network'. Rows with blank or non-numeric numeric fields are invalid.
    """
    # The date is the text before the first whitespace, e.g. '24/1/2023'.
    date_fields = (row['timestamp'] or '').split()
    # Compare for equality: "in ('2023')" would be a substring test on a
    # plain string and would also accept years such as '23' or '202'.
    if not date_fields or date_fields[0].split('/')[-1] != '2023':
        return False

    segments = (row['participant_ID'] or '').split('-')
    if [len(segment) for segment in segments] != [8, 4, 4, 4, 12]:
        return False
    # Every character of every segment must qualify, not just one
    # character of the last segment.
    if not all(ch.islower() or ch.isdigit()
               for segment in segments for ch in segment):
        return False

    try:
        accuracy = float(row['accuracy'])
        latitude = float(row['double_latitude'])
        longitude = float(row['double_longitude'])
        altitude = float(row['double_altitude'])
    except (TypeError, ValueError):
        # Blank (None for short rows) or non-numeric field.
        return False

    return (
        accuracy <= 30
        and -180 <= latitude <= 180
        and -180 <= longitude <= 180
        and 0 <= altitude <= 1000
        and row['provider'] in ('gps', 'network')
    )


def preprocessing(raw_filename, output_filename):
    """Append the rows of qualifying participants to an output CSV file.

    The input is read completely first and its rows are grouped by
    ``participant_ID``. A participant qualifies when they have at least 3
    rows in the input and at least one of those rows passes
    ``_row_is_valid``. For each qualifying participant, all of their rows
    are written, in input order.

    Args:
        raw_filename: Path of the CSV file to read. Its header row must
            include the columns used here (participant_ID, timestamp,
            accuracy, double_latitude, double_longitude, double_altitude,
            provider).
        output_filename: Path of the CSV file to append to. The input's
            header row is written before the data rows.

    Raises:
        KeyError: If a required column is missing from the input.
    """
    counting_part = defaultdict(lambda: {"checker": False, "rows": []})

    with open(raw_filename, newline="") as data:
        reader = csv.DictReader(data)
        # Read the header while the file is still open; it is None for an
        # empty input.
        fieldnames = reader.fieldnames
        for row in reader:
            entry = counting_part[row['participant_ID']]
            entry['rows'].append(list(row.values()))
            # One valid row is enough, so stop validating once it is found.
            if not entry['checker'] and _row_is_valid(row):
                entry['checker'] = True

    # Writing happens only after all rows are counted, so a participant's
    # earliest rows are not lost.
    with open(output_filename, "a", newline="") as new:
        health_write = csv.writer(new)
        if fieldnames:
            health_write.writerow(fieldnames)
        for item in counting_part.values():
            # "At least 3 rows" means >= 3, not > 3.
            if item['checker'] and len(item['rows']) >= 3:
                health_write.writerows(item['rows'])