# Mirror of https://github.com/FlagOpen/FlagEmbedding.git
# Synced 2025-06-27 02:39:58 +00:00 - 163 lines, 4.8 KiB, Python
"""
adapted from chemdataextractor.text.normalize
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tools for normalizing text.
https://github.com/mcs07/ChemDataExtractor
:copyright: Copyright 2016 by Matt Swain.
:license: MIT

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
'Software'), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

#: Control characters.
# normalize_text() deletes each of these outright (no replacement character).
# NOTE(review): '\u0000' and '\u0010' are not listed here - confirm whether
# that omission is intentional.
CONTROLS = {
    '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u000e', '\u000f', '\u0011',
    '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b',
}
# There are further control characters, but they are instead replaced with a space by unicode normalization:
# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f'


#: Hyphen and dash characters.
# Written as escapes because most of these glyphs are visually
# indistinguishable from one another in an editor.
HYPHENS = {
    '-',       # \u002d Hyphen-minus
    '\u2010',  # Hyphen
    '\u2011',  # Non-breaking hyphen
    '\u2043',  # Hyphen bullet
    '\u2012',  # Figure dash
    '\u2013',  # En dash
    '\u2014',  # Em dash
    '\u2015',  # Horizontal bar
}

#: Minus characters.
# Escapes keep the full-width form distinct from the ASCII hyphen-minus; a
# literal that gets width-folded would silently collapse into a duplicate '-'.
MINUSES = {
    '-',       # \u002d Hyphen-minus
    '\u2212',  # Minus
    '\uff0d',  # Full-width Hyphen-minus
    '\u207b',  # Superscript minus
}

#: Plus characters.
# Escapes keep the full-width form distinct from the ASCII plus sign.
PLUSES = {
    '+',       # \u002b Plus
    '\uff0b',  # Full-width Plus
    '\u207a',  # Superscript plus
}

#: Slash characters.
# normalize_text() folds every member of this set to the ASCII solidus.
SLASHES = {
    '/',       # \u002f Solidus
    '\u2044',  # Fraction slash
    '\u2215',  # Division slash
}

#: Tilde characters.
# Escapes keep the full-width form distinct from the ASCII tilde.
# "in mbert vocab" marks entries the original author flagged as present in
# the mBERT vocabulary.
TILDES = {
    '~',       # \u007e Tilde
    '\u02dc',  # Small tilde
    '\u2053',  # Swung dash
    '\u223c',  # Tilde operator (in mbert vocab)
    '\u223d',  # Reversed tilde
    '\u223f',  # Sine wave
    '\u301c',  # Wave dash (in mbert vocab)
    '\uff5e',  # Full-width tilde (in mbert vocab)
}

#: Apostrophe characters.
# Escapes are required for safety here: if the full-width apostrophe literal is
# width-folded to ASCII it becomes ''' and opens a triple-quoted string.
APOSTROPHES = {
    "'",       # \u0027 Apostrophe
    '\u2019',  # Right single quotation mark
    '\u055a',  # Armenian apostrophe
    '\ua78b',  # Latin capital letter saltillo
    '\ua78c',  # Latin small letter saltillo
    '\uff07',  # Full-width apostrophe
}

#: Single quote characters.
# Overlaps with APOSTROPHES on \u0027 and \u2019.
SINGLE_QUOTES = {
    "'",       # \u0027 Apostrophe
    '\u2018',  # Left single quotation mark
    '\u2019',  # Right single quotation mark
    '\u201a',  # Single low-9 quotation mark
    '\u201b',  # Single high-reversed-9 quotation mark
}

#: Double quote characters.
# normalize_text() folds every member of this set to the ASCII quotation mark.
DOUBLE_QUOTES = {
    '"',       # \u0022 Quotation mark
    '\u201c',  # Left double quotation mark
    '\u201d',  # Right double quotation mark
    '\u201e',  # Double low-9 quotation mark
    '\u201f',  # Double high-reversed-9 quotation mark
}

#: Accent characters.
# normalize_text() treats these like single quotes and folds them to "'".
ACCENTS = {
    '`',       # \u0060 Grave accent
    '\u00b4',  # Acute accent
}

#: Prime characters.
PRIMES = {
    '\u2032',  # Prime
    '\u2033',  # Double prime
    '\u2034',  # Triple prime
    '\u2035',  # Reversed prime
    '\u2036',  # Reversed double prime
    '\u2037',  # Reversed triple prime
    '\u2057',  # Quadruple prime
}

#: Quote characters, including apostrophes, single quotes, double quotes, accents and primes.
QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES


def normalize_text(text: str) -> str:
    """Fold look-alike punctuation in *text* to plain ASCII equivalents.

    The steps run in this order:

    1. Delete every character in ``CONTROLS``; replace vertical tab
       (U+000B), form feed (U+000C) and next line (U+0085) with a space.
    2. Replace every character in ``HYPHENS | MINUSES`` with ``-`` and
       delete soft hyphens (U+00AD).
    3. Replace every ``DOUBLE_QUOTES`` character with an ASCII double quote
       and every ``SINGLE_QUOTES`` / ``APOSTROPHES`` / ``ACCENTS`` character
       with an ASCII apostrophe.
    4. Expand each prime character into one to four ASCII apostrophes.
    5. Replace the ellipsis character (U+2026) with three dots and the
       spaced form (space, dot, space, dot, space, dot, space) with three
       dots surrounded by single spaces.
    6. Replace every ``SLASHES`` character with ``/``.

    ``PLUSES`` and ``TILDES`` are not used by this function.

    :param text: The string to normalize.
    :returns: The normalized string.
    """
    for control in CONTROLS:
        text = text.replace(control, '')
    # Vertical tab, form feed and NEL become an ordinary space rather than
    # being deleted, so adjacent words stay separated.
    for spacing in ('\u000b', '\u000c', '\u0085'):
        text = text.replace(spacing, ' ')

    for dash in HYPHENS | MINUSES:
        text = text.replace(dash, '-')
    text = text.replace('\u00ad', '')  # soft hyphen carries no visible glyph

    for double_quote in DOUBLE_QUOTES:
        text = text.replace(double_quote, '"')  # \u0022
    for single_quote in SINGLE_QUOTES | APOSTROPHES | ACCENTS:
        text = text.replace(single_quote, "'")  # \u0027

    # Each prime expands to as many apostrophes as it has strokes.
    prime_expansions = (
        ('\u2032', "'"),     # prime
        ('\u2035', "'"),     # reversed prime
        ('\u2033', "''"),    # double prime
        ('\u2036', "''"),    # reversed double prime
        ('\u2034', "'''"),   # triple prime
        ('\u2037', "'''"),   # reversed triple prime
        ('\u2057', "''''"),  # quadruple prime
    )
    for prime, apostrophes in prime_expansions:
        text = text.replace(prime, apostrophes)

    # Ellipsis character first, then the space-separated spelling.
    text = text.replace('\u2026', '...').replace(' . . . ', ' ... ')

    for slash in SLASHES:
        text = text.replace(slash, '/')

    # Tilde normalization is disabled: the original loop that replaced each
    # member of TILDES with '~' was commented out, so tilde variants pass
    # through unchanged.

    return text