mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-08-11 02:01:31 +00:00
Merge pull request #182 from rasbt/refine-duplicate-search
Refine duplicate search utility function
This commit is contained in:
commit
ffe8bb7489
@ -11,8 +11,8 @@ pip install -r requirements-extra.txt
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Finding Near-duplicates
|
### Finding Near Duplicates
|
||||||
|
|
||||||
The `find-near-duplicates.py` function can be used to identify duplicates and near-duplicates in an instruction dataset. For example,
|
The `find-near-duplicates.py` script can be used to identify duplicates and near-duplicates in an instruction dataset. For example,
|
||||||
|
|
||||||
@ -27,13 +27,9 @@ scikit-learn version: 1.3.1
|
|||||||
|
|
||||||
|
|
||||||
==================================================
|
==================================================
|
||||||
Searching 'instruction' for duplicates ...
|
Searching 'instruction' for duplicates ...
|
||||||
==================================================
|
==================================================
|
||||||
Duplicate pair found with similarity 0.85:
|
Duplicate pair found with similarity 0.94:
|
||||||
1. Determine the state of matter for helium at room temperature.
|
|
||||||
2. Determine the state of matter for nitrogen at room temperature.
|
|
||||||
|
|
||||||
Duplicate pair found with similarity 0.98:
|
|
||||||
1. Edit the following sentence to make it more formal.
|
1. Edit the following sentence to make it more formal.
|
||||||
2. Edit the sentence to make it more formal.
|
2. Edit the sentence to make it more formal.
|
||||||
|
|
||||||
@ -41,28 +37,21 @@ Duplicate pair found with similarity 1.00:
|
|||||||
1. Name a dwarf planet in our solar system.
|
1. Name a dwarf planet in our solar system.
|
||||||
2. Name a dwarf planet in our solar system.
|
2. Name a dwarf planet in our solar system.
|
||||||
|
|
||||||
Duplicate pair found with similarity 0.88:
|
Duplicate pair found with similarity 0.91:
|
||||||
1. Change the sentences from active voice to passive voice.
|
1. Change the sentences from active voice to passive voice.
|
||||||
2. Change the sentence from passive to active voice.
|
2. Change the sentence from passive to active voice.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
==================================================
|
==================================================
|
||||||
Searching 'input' for duplicates ...
|
Searching 'input' for duplicates ...
|
||||||
==================================================
|
==================================================
|
||||||
Duplicate pair found with similarity 0.88:
|
No duplicates found
|
||||||
1.
|
|
||||||
2. She said, "I am tired."
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
==================================================
|
==================================================
|
||||||
Searching 'output' for duplicates ...
|
Searching 'output' for duplicates ...
|
||||||
==================================================
|
==================================================
|
||||||
Duplicate pair found with similarity 0.82:
|
|
||||||
1. Helium is in a gaseous state at room temperature.
|
|
||||||
2. Nitrogen is in a gaseous state at room temperature.
|
|
||||||
|
|
||||||
Duplicate pair found with similarity 1.00:
|
Duplicate pair found with similarity 1.00:
|
||||||
1. One dwarf planet in our solar system is Pluto.
|
1. One dwarf planet in our solar system is Pluto.
|
||||||
2. One dwarf planet in our solar system is Pluto.
|
2. One dwarf planet in our solar system is Pluto.
|
||||||
@ -70,17 +59,22 @@ Duplicate pair found with similarity 1.00:
|
|||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
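You can use the `--threshold` setting with a value between 0 and 1 to decrease or increase the sensitivity: the higher the threshold, the more similar two texts have to be before they are reported as a duplicate pair.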
|
||||||
|
The default threshold is 0.9.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Creating Passive Voice Entries
|
## Creating Passive Voice Entries
|
||||||
|
|
||||||
- The [create-passive-voice-entries.ipynb](create-passive-voice-entries.ipynb) notebook uses OpenAI's GPT-4 to create "passive voice" entries for an instruction dataset, as shown in the example below
|
- The [create-passive-voice-entries.ipynb](create-passive-voice-entries.ipynb) notebook uses OpenAI's GPT-4 to create "passive voice" entries for an instruction dataset, as shown in the example below:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
{
|
{
|
||||||
'instruction': 'Identify the verb in the following sentence',
|
'instruction': 'Identify the verb in the following sentence',
|
||||||
'input': 'The cat sleeps on the couch.',
|
'input': 'The cat sleeps on the couch.',
|
||||||
'output': 'The verb in the sentence is "sleeps."',
|
'output': 'The verb in the sentence is "sleeps."',
|
||||||
'output_2': 'The sentence is "sleeps."' # <---- Newly created entry
|
'output_2': 'The sentence is "sleeps."' # <---- Newly created entry
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
from sklearn import __version__ as sklearn_version
|
from sklearn import __version__ as sklearn_version
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
@ -29,18 +30,27 @@ example_data = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
|
def preprocess_text(text):
    """Normalize a text entry before it is vectorized for duplicate search.

    The text is lowercased and every character that is neither a letter,
    a digit, nor whitespace is dropped, so entries that differ only in
    case or punctuation normalize to the same string.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with punctuation (including underscores)
        removed. Whitespace is left untouched.
    """
    # Lowercase the text
    text = text.lower()
    # Remove punctuation. `\w` also matches "_", so the underscore has to be
    # listed explicitly; otherwise runs such as "____" would survive this step.
    text = re.sub(r'[^\w\s]|_', '', text)
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def find_near_duplicates(json_data, threshold=0.75, key="instruction"):
|
||||||
"""The higher the threshold, the more similar the texts have to be to match"""
|
"""The higher the threshold, the more similar the texts have to be to match"""
|
||||||
|
|
||||||
# Extract instructions
|
# Extract instructions
|
||||||
text = [item[key] for item in json_data if item[key]]
|
text = [preprocess_text(item[key]) for item in json_data if item[key]]
|
||||||
near_duplicates = []
|
near_duplicates = []
|
||||||
|
indices_to_remove = set()
|
||||||
|
|
||||||
if not text:
|
if not text:
|
||||||
return near_duplicates
|
return {}, near_duplicates
|
||||||
|
|
||||||
# Vectorize the text data
|
# Vectorize the text data
|
||||||
vectorizer = TfidfVectorizer(stop_words=None)
|
vectorizer = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3))
|
||||||
tfidf_matrix = vectorizer.fit_transform(text)
|
tfidf_matrix = vectorizer.fit_transform(text)
|
||||||
|
|
||||||
# Compute cosine similarity between each pair of entries
|
# Compute cosine similarity between each pair of entries
|
||||||
@ -51,18 +61,29 @@ def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
|
|||||||
for i in range(len(cos_sim_matrix)):
|
for i in range(len(cos_sim_matrix)):
|
||||||
for j in range(i+1, len(cos_sim_matrix)):
|
for j in range(i+1, len(cos_sim_matrix)):
|
||||||
if cos_sim_matrix[i, j] > threshold:
|
if cos_sim_matrix[i, j] > threshold:
|
||||||
|
if len(json_data[i][key]) <= 1 or len(json_data[j][key]) <= 1:
|
||||||
|
continue
|
||||||
near_duplicates.append((json_data[i], json_data[j], cos_sim_matrix[i, j]))
|
near_duplicates.append((json_data[i], json_data[j], cos_sim_matrix[i, j]))
|
||||||
|
if key in ("input", "output"): # Don't remove duplicates based on the instruction
|
||||||
|
indices_to_remove.add(j) # Mark the second entry for removal
|
||||||
|
|
||||||
return near_duplicates
|
# Remove the near-duplicate entries
|
||||||
|
filtered_json_data = [item for index, item in enumerate(json_data) if index not in indices_to_remove]
|
||||||
|
|
||||||
|
return filtered_json_data, near_duplicates
|
||||||
|
|
||||||
|
|
||||||
def find_and_print_new_duplicates(json_data):
|
def find_print_and_remove_near_duplicates(json_data, remove_duplicates=False, threshold=0.75):
|
||||||
"""
|
"""
|
||||||
Searches each key in the first JSON object for duplicates across a list of JSON objects.
|
Searches each key in the first JSON object for duplicates across a list of JSON objects.
|
||||||
Prints the duplicates if found.
|
Prints the duplicates if found.
|
||||||
"""
|
"""
|
||||||
for key in json_data[0].keys():
|
for key in json_data[0].keys():
|
||||||
near_duplicates = find_near_duplicates(json_data, key=key)
|
|
||||||
|
if remove_duplicates:
|
||||||
|
json_data, near_duplicates = find_near_duplicates(json_data, key=key, threshold=threshold)
|
||||||
|
else:
|
||||||
|
_, near_duplicates = find_near_duplicates(json_data, key=key, threshold=threshold)
|
||||||
separator = 50 * '='
|
separator = 50 * '='
|
||||||
print(f"\n\n{separator}\nSearching '{key}' for duplicates ...\n{separator}")
|
print(f"\n\n{separator}\nSearching '{key}' for duplicates ...\n{separator}")
|
||||||
if not near_duplicates:
|
if not near_duplicates:
|
||||||
@ -73,6 +94,7 @@ def find_and_print_new_duplicates(json_data):
|
|||||||
f"Duplicate pair found with similarity {dup[2]:.2f}:\n"
|
f"Duplicate pair found with similarity {dup[2]:.2f}:\n"
|
||||||
f"1. {dup[0][key]}\n2. {dup[1][key]}\n"
|
f"1. {dup[0][key]}\n2. {dup[1][key]}\n"
|
||||||
)
|
)
|
||||||
|
return json_data
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -84,7 +106,35 @@ if __name__ == "__main__":
|
|||||||
type=str,
|
type=str,
|
||||||
help=("Path to the dataset JSON file")
|
help=("Path to the dataset JSON file")
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--threshold",
|
||||||
|
type=float,
|
||||||
|
default=0.9,
|
||||||
|
help=("A sensitivity threshold between 0 and 1 where 1 is strictest")
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--remove_duplicates",
|
||||||
|
action='store_true',
|
||||||
|
default=False,
|
||||||
|
help=(
|
||||||
|
"Removes duplicates based on the 'input' or 'output' keys "
|
||||||
|
" (but not the 'instruction') and saves the cleaned JSON file as --json_output_file"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--json_output_file",
|
||||||
|
type=str,
|
||||||
|
help=("Path to the dataset JSON file")
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.remove_duplicates and not args.json_output_file:
|
||||||
|
raise ValueError(
|
||||||
|
"Provide an output file via --json_output_file "
|
||||||
|
"to save the cleaned JSON data."
|
||||||
|
)
|
||||||
|
|
||||||
if not args.json_file:
|
if not args.json_file:
|
||||||
json_data = example_data
|
json_data = example_data
|
||||||
|
|
||||||
@ -92,4 +142,12 @@ if __name__ == "__main__":
|
|||||||
with open(args.json_file, "r") as file:
|
with open(args.json_file, "r") as file:
|
||||||
json_data = json.load(file)
|
json_data = json.load(file)
|
||||||
|
|
||||||
find_and_print_new_duplicates(json_data)
|
json_data = find_print_and_remove_near_duplicates(
|
||||||
|
json_data=json_data,
|
||||||
|
remove_duplicates=args.remove_duplicates,
|
||||||
|
threshold=args.threshold
|
||||||
|
)
|
||||||
|
|
||||||
|
if args.remove_duplicates:
|
||||||
|
with open(args.json_output_file, "w") as file:
|
||||||
|
json.dump(json_data, file, indent=4)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user