# Copyright (c) OpenMMLab. All rights reserved.
"""Filter a JSON-lines file by category and update date.

The source file is read one JSON document per line.  Records are kept when
their space-separated ``categories`` field shares at least one entry with the
requested categories and their ``update_date`` (``YYYY-MM-DD``) is on or after
the requested start date.  The surviving records are written to the
destination file as a single JSON array.
"""
import argparse
import json
from datetime import datetime


def parse_args():
    """Parse the command-line arguments.

    Returns:
        argparse.Namespace: Namespace with ``src_file``, ``dst_file``,
        ``categories`` (list of str) and ``start_date`` (str).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('src_file', help='source file path')
    parser.add_argument('dst_file', help='destination file path')
    parser.add_argument(
        '--categories',
        nargs='+',
        default=['cs.AI', 'cs.CL', 'cs.CV'],
        help='target categories')
    parser.add_argument(
        '--start-date',
        default='2020-01-01',
        help='start date (format: YYYY-MM-DD)')
    args = parser.parse_args()
    return args


def has_intersection(list1, list2):
    """Return whether two iterables share at least one element.

    Args:
        list1: Iterable of hashable items.
        list2: Iterable of hashable items.

    Returns:
        bool: True if some element appears in both iterables.
    """
    # ``isdisjoint`` stops at the first common element and avoids building
    # the full intersection set.
    return not set(list1).isdisjoint(list2)


def read_json_file(file_path):
    """Load a JSON-lines file, one JSON document per line.

    Blank lines are skipped.  Lines that are not valid JSON are reported on
    stdout and skipped, so a few corrupt lines do not abort the whole run.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        list: The decoded JSON documents, in file order.
    """
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # An empty line is not a malformed record; ignore it quietly.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                print(f'Failed to parse line: {line}')
    return data


def main():
    """Filter the source file and dump the selected records as JSON.

    Raises:
        ValueError: If ``--start-date`` or a candidate record's
            ``update_date`` does not match ``YYYY-MM-DD``.
        KeyError: If a record lacks ``categories``, or a category-matching
            record lacks ``update_date``.
    """
    args = parse_args()
    json_data = read_json_file(args.src_file)
    from_time = datetime.strptime(args.start_date, '%Y-%m-%d')
    # Build the lookup set once instead of once per record.
    wanted = set(args.categories)

    # The category test comes first so dates are only parsed for records
    # that already match a requested category.
    filtered_data = [
        item for item in json_data
        if not wanted.isdisjoint(item['categories'].split())
        and datetime.strptime(item['update_date'], '%Y-%m-%d') >= from_time
    ]

    with open(args.dst_file, 'w', encoding='utf-8') as file:
        json.dump(filtered_data, file)

    print(f'Save to {args.dst_file}\n{len(filtered_data)} items')


if __name__ == '__main__':
    main()