Merge pull request #182 from rasbt/refine-duplicate-search

Refine duplicate search utility function
Sebastian Raschka, 2024-05-26 16:19:17 -04:00, committed by GitHub
commit ffe8bb7489
2 changed files with 89 additions and 37 deletions

README.md

@@ -11,8 +11,8 @@ pip install -r requirements-extra.txt

-## Finding Near-duplicates
+### Finding Near Duplicates

 The `find-near-duplicates.py` function can be used to identify duplicates and near-duplicates in an instruction dataset. For example,
@@ -27,13 +27,9 @@ scikit-learn version: 1.3.1

 ==================================================
 Searching 'instruction' for duplicates ...
 ==================================================
-Duplicate pair found with similarity 0.85:
-1. Determine the state of matter for helium at room temperature.
-2. Determine the state of matter for nitrogen at room temperature.
-
-Duplicate pair found with similarity 0.98:
+Duplicate pair found with similarity 0.94:
 1. Edit the following sentence to make it more formal.
 2. Edit the sentence to make it more formal.
@@ -41,28 +37,21 @@ Duplicate pair found with similarity 1.00:
 1. Name a dwarf planet in our solar system.
 2. Name a dwarf planet in our solar system.

-Duplicate pair found with similarity 0.88:
+Duplicate pair found with similarity 0.91:
 1. Change the sentences from active voice to passive voice.
 2. Change the sentence from passive to active voice.

 ==================================================
 Searching 'input' for duplicates ...
 ==================================================
-Duplicate pair found with similarity 0.88:
-1.
-2. She said, "I am tired."
+No duplicates found

 ==================================================
 Searching 'output' for duplicates ...
 ==================================================
-Duplicate pair found with similarity 0.82:
-1. Helium is in a gaseous state at room temperature.
-2. Nitrogen is in a gaseous state at room temperature.
-
 Duplicate pair found with similarity 1.00:
 1. One dwarf planet in our solar system is Pluto.
 2. One dwarf planet in our solar system is Pluto.
@@ -70,17 +59,22 @@ Duplicate pair found with similarity 1.00:
 ```

+You can use the `--threshold` setting with a value between 0 and 1 to decrease or increase the sensitivity.
+The default threshold is 0.9.
+
 ## Creating Passive Voice Entries

 - The [create-passive-voice-entries.ipynb](create-passive-voice-entries.ipynb) notebook uses OpenAI's GPT-4 to create "passive voice" entries for an instruction dataset, as shown in the example below

 ```python
 {
     'instruction': 'Identify the verb in the following sentence',
     'input': 'The cat sleeps on the couch.',
     'output': 'The verb in the sentence is "sleeps."',
     'output_2': 'The sentence is "sleeps."'   # <---- Newly created entry
 }
 ```
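As a quick usage aside (not part of the diff; the dataset filenames below are placeholders), the flags added in this commit combine like so:

```bash
# Report near-duplicates only, with a stricter-than-default threshold
python find-near-duplicates.py --json_file instruction-data.json --threshold 0.95

# Additionally drop near-duplicate 'input'/'output' entries and save the result
python find-near-duplicates.py \
    --json_file instruction-data.json \
    --remove_duplicates \
    --json_output_file instruction-data-cleaned.json
```

Passing `--remove_duplicates` without `--json_output_file` raises a `ValueError`, as enforced in the script changes below.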

find-near-duplicates.py

@@ -6,6 +6,7 @@

 import argparse
 import json
+import re

 from sklearn import __version__ as sklearn_version
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
@@ -29,18 +30,27 @@ example_data = [
 ]


-def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
+def preprocess_text(text):
+    # Lowercase the text
+    text = text.lower()
+
+    # Remove punctuation
+    text = re.sub(r'[^\w\s]', '', text)
+    return text
+
+
+def find_near_duplicates(json_data, threshold=0.75, key="instruction"):
     """The higher the threshold, the more similar the texts have to be to match"""

     # Extract instructions
-    text = [item[key] for item in json_data if item[key]]
+    text = [preprocess_text(item[key]) for item in json_data if item[key]]
     near_duplicates = []
+    indices_to_remove = set()

     if not text:
-        return near_duplicates
+        return {}, near_duplicates

     # Vectorize the text data
-    vectorizer = TfidfVectorizer(stop_words=None)
+    vectorizer = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3))
     tfidf_matrix = vectorizer.fit_transform(text)

     # Compute cosine similarity between each pair of entries
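A brief aside on the vectorizer change above: with `analyzer='char'` and `ngram_range=(1, 3)`, texts are compared by overlapping character 1- to 3-grams rather than whole words, so small wording differences still yield high cosine similarity. A minimal, self-contained sketch (the example strings echo the README output above; the vectorizer settings mirror the diff):

```python
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def preprocess_text(text):
    # Same normalization as in the diff: lowercase, then strip punctuation
    text = text.lower()
    return re.sub(r'[^\w\s]', '', text)


texts = [preprocess_text(t) for t in (
    "Edit the following sentence to make it more formal.",
    "Edit the sentence to make it more formal!",
)]

# Character 1- to 3-gram TF-IDF, as in the updated find_near_duplicates
vectorizer = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(texts)

similarity = cosine_similarity(tfidf_matrix)[0, 1]
print(f"Similarity: {similarity:.2f}")  # high (near 1.0) despite the wording change
```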
@@ -51,18 +61,29 @@ def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
     for i in range(len(cos_sim_matrix)):
         for j in range(i+1, len(cos_sim_matrix)):
             if cos_sim_matrix[i, j] > threshold:
+                if len(json_data[i][key]) <= 1 or len(json_data[j][key]) <= 1:
+                    continue
                 near_duplicates.append((json_data[i], json_data[j], cos_sim_matrix[i, j]))
+                if key in ("input", "output"):  # Don't remove duplicates based on the instruction
+                    indices_to_remove.add(j)  # Mark the second entry for removal

-    return near_duplicates
+    # Remove the near-duplicate entries
+    filtered_json_data = [item for index, item in enumerate(json_data) if index not in indices_to_remove]
+
+    return filtered_json_data, near_duplicates


-def find_and_print_new_duplicates(json_data):
+def find_print_and_remove_near_duplicates(json_data, remove_duplicates=False, threshold=0.75):
     """
     Searches each key in the first JSON object for duplicates across a list of JSON objects.
     Prints the duplicates if found.
     """
     for key in json_data[0].keys():
-        near_duplicates = find_near_duplicates(json_data, key=key)
+        if remove_duplicates:
+            json_data, near_duplicates = find_near_duplicates(json_data, key=key, threshold=threshold)
+        else:
+            _, near_duplicates = find_near_duplicates(json_data, key=key, threshold=threshold)
+
         separator = 50 * '='
         print(f"\n\n{separator}\nSearching '{key}' for duplicates ...\n{separator}")
         if not near_duplicates:
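One observation about the new return contract (an aside, not part of the commit): `find_near_duplicates` now returns a `(filtered_data, near_duplicates)` tuple, so every caller unpacks two values; note that the empty-input branch returns `{}` rather than `[]` as the first element. A hypothetical direct call might look like:

```python
# Sketch: calling the updated function directly on the script's example_data
filtered, dupes = find_near_duplicates(example_data, threshold=0.75, key="output")
for first, second, score in dupes:
    print(f"{score:.2f}: {first['output']!r} vs. {second['output']!r}")

# Removal only happens for the 'input' and 'output' keys, never 'instruction'
print(f"{len(example_data) - len(filtered)} near-duplicate entries dropped")
```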
@@ -73,6 +94,7 @@ def find_and_print_new_duplicates(json_data):
                 f"Duplicate pair found with similarity {dup[2]:.2f}:\n"
                 f"1. {dup[0][key]}\n2. {dup[1][key]}\n"
             )
+    return json_data


 if __name__ == "__main__":
@@ -84,7 +106,35 @@ if __name__ == "__main__":
         type=str,
         help=("Path to the dataset JSON file")
     )
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        default=0.9,
+        help=("A sensitivity threshold between 0 and 1 where 1 is strictest")
+    )
+    parser.add_argument(
+        "--remove_duplicates",
+        action='store_true',
+        default=False,
+        help=(
+            "Removes duplicates based on the 'input' or 'output' keys "
+            "(but not the 'instruction') and saves the cleaned JSON file as --json_output_file"
+        )
+    )
+    parser.add_argument(
+        "--json_output_file",
+        type=str,
+        help=("Path to the output JSON file for the cleaned dataset")
+    )
     args = parser.parse_args()

+    if args.remove_duplicates and not args.json_output_file:
+        raise ValueError(
+            "Provide an output file via --json_output_file "
+            "to save the cleaned JSON data."
+        )
+
     if not args.json_file:
         json_data = example_data
@@ -92,4 +142,12 @@ if __name__ == "__main__":
         with open(args.json_file, "r") as file:
             json_data = json.load(file)

-    find_and_print_new_duplicates(json_data)
+    json_data = find_print_and_remove_near_duplicates(
+        json_data=json_data,
+        remove_duplicates=args.remove_duplicates,
+        threshold=args.threshold
+    )
+
+    if args.remove_duplicates:
+        with open(args.json_output_file, "w") as file:
+            json.dump(json_data, file, indent=4)
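Worth noting about the `__main__` flow above: when `--remove_duplicates` is set, `find_print_and_remove_near_duplicates` reassigns `json_data` after each key pass, so the 'output' search runs on data already filtered by the 'input' search, and the final `json.dump` writes the fully filtered list. A compact sketch of that progressive filtering (the data is invented, and the sketch assumes the functions from this script are in scope):

```python
# Progressive filtering across keys, as in find_print_and_remove_near_duplicates
data = [
    {"input": "She said, 'I am tired.'", "output": "Quote formalized."},
    {"input": "She said I am tired",     "output": "Quote formalized!"},
    {"input": "Name a dwarf planet.",    "output": "Pluto."},
]

for key in data[0].keys():
    # Each pass receives the list already filtered by the previous pass
    data, dupes = find_near_duplicates(data, key=key, threshold=0.75)
    print(f"after '{key}' pass: {len(data)} entries remain")
```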