jobbert_knowledge_extraction / filter-faults.py
Robzy's picture
mismatch deletion
762e05d
raw
history blame contribute delete
959 Bytes
import json
def count_mismatch(file_path):
    """Count JSONL records whose token and tag lists differ in length.

    Every non-blank line of the file is parsed as a JSON object, and the
    length of its ``tokens`` value is compared with the length of its
    ``tags_knowledge`` value.

    Args:
        file_path: Path of the JSON Lines file to scan.

    Returns:
        The number of records whose two lengths differ.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``tokens`` or ``tags_knowledge``.
    """
    mismatches = 0
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a stray trailing newline) are not records.
            if not line.strip():
                continue
            data = json.loads(line)
            tokens, tags = data['tokens'], data['tags_knowledge']
            if len(tokens) != len(tags):
                mismatches += 1
    return mismatches
def delete_mismatched_lines(file_path):
    """Rewrite a JSONL file in place, dropping length-mismatched records.

    A record is kept when the length of its ``tokens`` value equals the
    length of its ``tags_knowledge`` value. Kept lines are written back
    unchanged and in their original order. Blank lines are not records and
    are preserved as they are.

    Args:
        file_path: Path of the JSON Lines file to filter in place.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            file is left untouched in that case.
        KeyError: If a record lacks ``tokens`` or ``tags_knowledge``. The
            file is left untouched in that case.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Decide what to keep BEFORE reopening for writing: mode 'w' truncates
    # the file immediately, so a parse error in the middle of the write loop
    # would otherwise destroy every record that had not been written yet.
    kept = []
    for line in lines:
        if not line.strip():
            kept.append(line)
            continue
        data = json.loads(line)
        tokens, tags = data['tokens'], data['tags_knowledge']
        if len(tokens) == len(tags):
            kept.append(line)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(kept)
if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default dataset
    # path; running the script with no arguments behaves as before.
    file_path = sys.argv[1] if len(sys.argv) > 1 else 'data/tags-04-01-2025.jsonl'

    count = count_mismatch(file_path)
    if count > 0:
        # Only rewrite the file when there is actually something to remove.
        delete_mismatched_lines(file_path)
    print(f"Deleted {count} mismatched lines.")