style and requirements

This commit is contained in:
rasbt 2024-05-25 11:38:55 -05:00
parent abdb2fc61f
commit b5dd2259b8
No known key found for this signature in database
GPG Key ID: 3C6E5C7C075611DB
3 changed files with 33 additions and 8 deletions

View File

@@ -2,6 +2,14 @@
This folder contains utility code that can be used for preparing an instruction dataset. This folder contains utility code that can be used for preparing an instruction dataset.
Install the additional package requirements via:
```bash
pip install -r requirements-extra.txt
```
### Finding near duplicates ### Finding near duplicates

View File

@@ -12,11 +12,19 @@ from sklearn.metrics.pairwise import cosine_similarity
# Sample JSON dataset # Sample JSON dataset
example_data = [ example_data = [
{"instruction": "What is the capital of Italy?", "input": "", "output": "The capital of Italy is Rome."}, {"instruction": "What is the capital of Italy?",
{"instruction": "What's the capital city of Italy?", "input": "", "output": "The capital city is Rome."}, "input": "", "output": "The capital of Italy is Rome."
{"instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'", "input": "", "output": "The verb is 'sleeps'."}, },
{"instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.", "input": "", "output": "The verb in the sentence is \"sleeps.\""}, {"instruction": "What's the capital city of Italy?",
# Add other entries... "input": "", "output": "The capital city is Rome."
},
{"instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'",
"input": "", "output": "The verb is 'sleeps'."
},
{"instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.",
"input": "", "output": "The verb in the sentence is \"sleeps.\""
},
# ...
] ]
@@ -48,15 +56,22 @@ def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
def find_and_print_new_duplicates(json_data): def find_and_print_new_duplicates(json_data):
"""
Searches each key in the first JSON object for duplicates across a list of JSON objects.
Prints the duplicates if found.
"""
for key in json_data[0].keys(): for key in json_data[0].keys():
near_duplicates = find_near_duplicates(json_data, key=key) near_duplicates = find_near_duplicates(json_data, key=key)
print(f"\n\n{50*'='}\n Searching '{key}' for duplicates ...\n{50*'='}") separator = 50 * '='
print(f"\n\n{separator}\nSearching '{key}' for duplicates ...\n{separator}")
if not near_duplicates: if not near_duplicates:
print("No duplicates found") print("No duplicates found")
else: else:
for dup in near_duplicates: for dup in near_duplicates:
print(f"Duplicate pair found with similarity {dup[2]:.2f}:\n" print(
f"1. {dup[0][key]}\n2. {dup[1][key]}\n") f"Duplicate pair found with similarity {dup[2]:.2f}:\n"
f"1. {dup[0][key]}\n2. {dup[1][key]}\n"
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -0,0 +1,2 @@
openai
scikit-learn