mirror of
https://github.com/FlagOpen/FlagEmbedding.git
synced 2025-07-07 17:12:49 +00:00
163 lines
4.8 KiB
Python
163 lines
4.8 KiB
Python
![]() |
"""
|
|||
|
adapted from chemdataextractor.text.normalize
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tools for normalizing text.
https://github.com/mcs07/ChemDataExtractor
:copyright: Copyright 2016 by Matt Swain.
:license: MIT

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
'Software'), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|||
|
"""
|
|||
|
|
|||
|
#: C0 control characters that are deleted outright during normalization.
CONTROLS = {
    chr(code_point)
    for code_point in (
        *range(0x01, 0x09),  # U+0001 .. U+0008
        0x0E,
        0x0F,
        *range(0x11, 0x1C),  # U+0011 .. U+001B
    )
}
# The whitespace-like controls U+0009-U+000D and U+001C-U+001F are left out on
# purpose: they are meant to be replaced with a space by Unicode normalization
# rather than deleted.
# NOTE(review): no Unicode normalization step is visible in this module, and
# U+0000 / U+0010 appear in neither list -- confirm whether that is intentional.
|
|||
|
|
|||
|
|
|||
|
#: Characters treated as hyphens or dashes.
HYPHENS = {
    '\u002d',  # hyphen-minus
    '\u2010',  # hyphen
    '\u2011',  # non-breaking hyphen
    '\u2043',  # hyphen bullet
    '\u2012',  # figure dash
    '\u2013',  # en dash
    '\u2014',  # em dash
    '\u2015',  # horizontal bar
}
|
|||
|
|
|||
|
#: Characters treated as minus signs.
#: Written as escape sequences so that visually similar code points (notably
#: the full-width hyphen-minus) cannot be silently collapsed to ASCII ``-`` by
#: an editor or text-conversion step.
MINUSES = {
    '\u002d',  # hyphen-minus
    '\u2212',  # minus sign
    '\uff0d',  # full-width hyphen-minus
    '\u207b',  # superscript minus
}
|
|||
|
|
|||
|
#: Characters treated as plus signs.
#: Written as escape sequences so that the full-width plus cannot be silently
#: collapsed to ASCII ``+`` by an editor or text-conversion step.
PLUSES = {
    '\u002b',  # plus sign
    '\uff0b',  # full-width plus sign
    '\u207a',  # superscript plus
}
|
|||
|
|
|||
|
#: Characters treated as forward slashes.
SLASHES = {
    '\u002f',  # solidus
    '\u2044',  # fraction slash
    '\u2215',  # division slash
}
|
|||
|
|
|||
|
#: Characters treated as tildes.
#: Written as escape sequences so that the full-width tilde cannot be silently
#: collapsed to ASCII ``~`` by an editor or text-conversion step.
TILDES = {
    '\u007e',  # tilde
    '\u02dc',  # small tilde
    '\u2053',  # swung dash
    '\u223c',  # tilde operator (in mbert vocab)
    '\u223d',  # reversed tilde
    '\u223f',  # sine wave
    '\u301c',  # wave dash (in mbert vocab)
    '\uff5e',  # full-width tilde (in mbert vocab)
}
|
|||
|
|
|||
|
#: Characters treated as apostrophes.
#: Written as escape sequences: a full-width apostrophe collapsed to ASCII
#: would read as ``'''`` and open a triple-quoted string instead of adding a
#: set member.
APOSTROPHES = {
    '\u0027',  # apostrophe
    '\u2019',  # right single quotation mark
    '\u055a',  # Armenian apostrophe
    '\ua78b',  # Latin capital letter saltillo
    '\ua78c',  # Latin small letter saltillo
    '\uff07',  # full-width apostrophe
}
|
|||
|
|
|||
|
#: Characters treated as single quotation marks.
SINGLE_QUOTES = {
    '\u0027',  # apostrophe
    '\u2018',  # left single quotation mark
    '\u2019',  # right single quotation mark
    '\u201a',  # single low-9 quotation mark
    '\u201b',  # single high-reversed-9 quotation mark
}
|
|||
|
|
|||
|
#: Characters treated as double quotation marks.
DOUBLE_QUOTES = {
    '\u0022',  # quotation mark
    '\u201c',  # left double quotation mark
    '\u201d',  # right double quotation mark
    '\u201e',  # double low-9 quotation mark
    '\u201f',  # double high-reversed-9 quotation mark
}
|
|||
|
|
|||
|
#: Stand-alone accent characters.
ACCENTS = {
    '\u0060',  # grave accent
    '\u00b4',  # acute accent
}
|
|||
|
|
|||
|
#: Prime characters (forward and reversed, single through quadruple).
PRIMES = {
    '\u2032',  # prime
    '\u2033',  # double prime
    '\u2034',  # triple prime
    '\u2035',  # reversed prime
    '\u2036',  # reversed double prime
    '\u2037',  # reversed triple prime
    '\u2057',  # quadruple prime
}
|
|||
|
|
|||
|
#: Every quote-like character: the union of the apostrophe, single-quote,
#: double-quote, accent and prime sets.
QUOTES = set().union(APOSTROPHES, SINGLE_QUOTES, DOUBLE_QUOTES, ACCENTS, PRIMES)
|
|||
|
|
|||
|
def normalize_text(text: str) -> str:
    """Normalize control characters, dashes, quotes, ellipses and slashes.

    The following rewrites are applied to ``text``:

    * every character in ``CONTROLS`` and the soft hyphen (U+00AD) is removed;
    * vertical tab (U+000B), form feed (U+000C) and next line (U+0085)
      become a single space;
    * every character in ``HYPHENS`` or ``MINUSES`` becomes ``-``;
    * every character in ``DOUBLE_QUOTES`` becomes ``"``;
    * every character in ``SINGLE_QUOTES``, ``APOSTROPHES`` or ``ACCENTS``
      becomes ``'``;
    * single, double, triple and quadruple primes (forward or reversed)
      become one, two, three and four ``'`` respectively;
    * the horizontal ellipsis (U+2026) becomes ``...`` and the spaced form
      `` . . . `` becomes `` ... ``;
    * every character in ``SLASHES`` becomes ``/``.

    Characters in ``TILDES`` are left unchanged; tilde normalization is
    disabled in this function.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # All single-character rewrites go into one translation table so the text
    # is scanned once instead of once per character. A single pass matches the
    # step-by-step replacement because no replacement output is itself a key
    # that a different rule would rewrite further.
    #
    # Rules are registered from the last step to the first so that, should two
    # of the character sets ever overlap, the rule that used to run first
    # still takes precedence.
    char_map = {}
    char_map.update(dict.fromkeys(SLASHES, '/'))
    char_map['\u2026'] = '...'  # horizontal ellipsis
    char_map.update({
        '\u2032': "'",     # prime
        '\u2035': "'",     # reversed prime
        '\u2033': "''",    # double prime
        '\u2036': "''",    # reversed double prime
        '\u2034': "'''",   # triple prime
        '\u2037': "'''",   # reversed triple prime
        '\u2057': "''''",  # quadruple prime
    })
    char_map.update(dict.fromkeys(SINGLE_QUOTES | APOSTROPHES | ACCENTS, "'"))
    char_map.update(dict.fromkeys(DOUBLE_QUOTES, '"'))
    char_map['\u00ad'] = None  # soft hyphen: delete
    char_map.update(dict.fromkeys(HYPHENS | MINUSES, '-'))
    char_map.update(dict.fromkeys(('\u000b', '\u000c', '\u0085'), ' '))
    char_map.update(dict.fromkeys(CONTROLS))  # value None: delete

    text = text.translate(str.maketrans(char_map))

    # The spaced ellipsis is a multi-character pattern, so it cannot live in
    # the translation table and is handled separately.
    return text.replace(' . . . ', ' ... ')
|