style and requirements

This commit is contained in:
rasbt 2024-05-25 11:38:55 -05:00
parent abdb2fc61f
commit b5dd2259b8
No known key found for this signature in database
GPG Key ID: 3C6E5C7C075611DB
3 changed files with 33 additions and 8 deletions

View File

@ -2,6 +2,14 @@
This folder contains utility code that can be used for preparing an instruction dataset.
Install the additional package requirements via:
```bash
pip install -r requirements-extra.txt
```
### Finding near duplicates

View File

@ -12,11 +12,19 @@ from sklearn.metrics.pairwise import cosine_similarity
# Sample JSON dataset
# Small in-file sample in the instruction / input / output record format.
# Entries 1-2 and 3-4 are deliberate paraphrases of each other, so the sample
# contains near duplicates; each record must appear only once, otherwise the
# list would also contain exact duplicates.
example_data = [
    {
        "instruction": "What is the capital of Italy?",
        "input": "",
        "output": "The capital of Italy is Rome.",
    },
    {
        "instruction": "What's the capital city of Italy?",
        "input": "",
        "output": "The capital city is Rome.",
    },
    {
        "instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'",
        "input": "",
        "output": "The verb is 'sleeps'.",
    },
    {
        "instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.",
        "input": "",
        "output": "The verb in the sentence is \"sleeps.\"",
    },
    # ...
]
@ -48,15 +56,22 @@ def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
def find_and_print_new_duplicates(json_data):
    """
    Search every field of a list of JSON objects for near duplicates and print them.

    The field names are read from the first object in ``json_data``. For each
    field, ``find_near_duplicates`` is called with ``key`` set to that field
    name, a header is printed, and then either "No duplicates found" or one
    message per reported pair.

    Args:
        json_data: Sequence of JSON objects (dicts). Only the keys of the
            first object are searched. An empty sequence is a no-op.

    Returns:
        None. All results are written to standard output.
    """
    # Nothing to inspect; also avoids an IndexError on json_data[0] below.
    if not json_data:
        return

    separator = 50 * '='

    for key in json_data[0]:
        near_duplicates = find_near_duplicates(json_data, key=key)

        print(f"\n\n{separator}\nSearching '{key}' for duplicates ...\n{separator}")

        if not near_duplicates:
            print("No duplicates found")
            continue

        # Each result is indexed as (first entry, second entry, similarity).
        for dup in near_duplicates:
            print(
                f"Duplicate pair found with similarity {dup[2]:.2f}:\n"
                f"1. {dup[0][key]}\n2. {dup[1][key]}\n"
            )
if __name__ == "__main__":

View File

@ -0,0 +1,2 @@
openai
scikit-learn