# File info: last modified 2025-05-28 14:12:16 +08:00 | 38 lines | 1.1 KiB | Python

import jsonlines
import csv
import os
def load_jsonlines(file):
    """Read every record of a JSON Lines file into a list.

    Args:
        file: Path of the JSON Lines file, handed to ``jsonlines.open``.

    Returns:
        A list holding each object yielded by the reader, in file order.
    """
    with jsonlines.open(file, mode='r') as reader:
        # Drain the reader while the file is still open.
        records = list(reader)
    return records
def save_file_jsonl(data, fp):
    """Write an iterable of objects to a JSON Lines file.

    Args:
        data: Iterable of objects passed to the writer's ``write_all``.
        fp: Destination path; it is opened in ``'w'`` mode, so an existing
            file is overwritten.
    """
    with jsonlines.open(fp, 'w') as out:
        out.write_all(data)
def save_tsv_dict(data, fp, fields):
    """Write a sequence of dicts to a tab-separated file with a header row.

    Missing parent directories of ``fp`` are created first.

    Args:
        data: Iterable of dicts; each dict is written as one row.
        fp: Destination file path. May be a bare file name, in which case
            the file is written to the current working directory.
        fields: Column names, used for the header and as the
            ``fieldnames`` of ``csv.DictWriter``.

    Raises:
        ValueError: Propagated from ``csv.DictWriter`` when a row contains
            a key that is not listed in ``fields``.
    """
    # Build the target directory. os.path.dirname returns "" for a bare
    # file name and os.makedirs("") raises FileNotFoundError, so skip it.
    dir_path = os.path.dirname(fp)
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    # newline='' lets the csv module control line endings itself (otherwise
    # rows end in "\r\r\n" on Windows); an explicit UTF-8 encoding keeps
    # non-ASCII text from depending on the platform locale.
    with open(fp, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields, delimiter='\t')
        writer.writeheader()
        writer.writerows(data)
def cost_esitmate(path):
    """Print record counts and rough word counts for a dataset directory.

    Loads ``corpus.jsonl`` and ``queries.jsonl`` from ``path`` and prints
    four lines, in this order: number of corpus records, number of query
    records, total corpus words, total query words.

    A "word" is a piece produced by splitting a record's ``"text"`` value
    on single space characters, so consecutive spaces and empty strings
    still contribute to the count.

    The misspelled function name is kept so existing callers keep working.

    Args:
        path: Directory containing ``corpus.jsonl`` and ``queries.jsonl``.
            Every record in both files must have a string ``"text"`` field.

    Raises:
        KeyError: If a record has no ``"text"`` key.
    """
    corpus = load_jsonlines(os.path.join(path, "corpus.jsonl"))
    queries = load_jsonlines(os.path.join(path, "queries.jsonl"))

    # Plain iteration: the previous version wrapped these loops in tqdm,
    # which this module never imports, so every call raised NameError.
    num_corpus_words = sum(len(item["text"].split(" ")) for item in corpus)
    num_queries_words = sum(len(item["text"].split(" ")) for item in queries)

    print(len(corpus))
    print(len(queries))
    print(num_corpus_words)
    print(num_queries_words)