fix: prevent document processing failures from UTF-8 surrogate characters

- Change sanitize_text_for_encoding to fail-fast instead of returning error placeholders
- Add strict UTF-8 cleaning pipeline to entity/relationship extraction
- Skip problematic entities/relationships instead of corrupting data

Fixes document processing crashes when encountering surrogate characters (U+D800-U+DFFF)
This commit is contained in:
yangdx 2025-08-27 23:52:39 +08:00
parent 4dfbe5e2db
commit 99e28e815b
2 changed files with 135 additions and 110 deletions

View File

@@ -31,6 +31,7 @@ from .utils import (
pick_by_vector_similarity, pick_by_vector_similarity,
process_chunks_unified, process_chunks_unified,
build_file_path, build_file_path,
sanitize_text_for_encoding,
) )
from .base import ( from .base import (
BaseGraphStorage, BaseGraphStorage,
@@ -318,34 +319,35 @@ async def _handle_single_entity_extraction(
if len(record_attributes) < 4 or '"entity"' not in record_attributes[0]: if len(record_attributes) < 4 or '"entity"' not in record_attributes[0]:
return None return None
# Clean and validate entity name try:
entity_name = clean_str(record_attributes[1]).strip() # Step 1: Strict UTF-8 encoding sanitization (fail-fast approach)
if not entity_name: entity_name = sanitize_text_for_encoding(record_attributes[1])
logger.warning(
f"Entity extraction error: empty entity name in: {record_attributes}"
)
return None
# Normalize entity name # Step 2: HTML and control character cleaning
entity_name = clean_str(entity_name).strip()
# Step 3: Business logic normalization
entity_name = normalize_extracted_info(entity_name, is_entity=True) entity_name = normalize_extracted_info(entity_name, is_entity=True)
# Check if entity name became empty after normalization # Validate entity name after all cleaning steps
if not entity_name or not entity_name.strip(): if not entity_name or not entity_name.strip():
logger.warning( logger.warning(
f"Entity extraction error: entity name became empty after normalization. Original: '{record_attributes[1]}'" f"Entity extraction error: entity name became empty after cleaning. Original: '{record_attributes[1]}'"
) )
return None return None
# Clean and validate entity type # Process entity type with same cleaning pipeline
entity_type = clean_str(record_attributes[2]).strip('"') entity_type = sanitize_text_for_encoding(record_attributes[2])
entity_type = clean_str(entity_type).strip('"')
if not entity_type.strip() or entity_type.startswith('("'): if not entity_type.strip() or entity_type.startswith('("'):
logger.warning( logger.warning(
f"Entity extraction error: invalid entity type in: {record_attributes}" f"Entity extraction error: invalid entity type in: {record_attributes}"
) )
return None return None
# Clean and validate description # Process entity description with same cleaning pipeline
entity_description = clean_str(record_attributes[3]) entity_description = sanitize_text_for_encoding(record_attributes[3])
entity_description = clean_str(entity_description)
entity_description = normalize_extracted_info(entity_description) entity_description = normalize_extracted_info(entity_description)
if not entity_description.strip(): if not entity_description.strip():
@@ -362,6 +364,17 @@ async def _handle_single_entity_extraction(
file_path=file_path, file_path=file_path,
) )
except ValueError as e:
logger.error(
f"Entity extraction failed due to encoding issues in chunk {chunk_key}: {e}"
)
return None
except Exception as e:
logger.error(
f"Entity extraction failed with unexpected error in chunk {chunk_key}: {e}"
)
return None
async def _handle_single_relationship_extraction( async def _handle_single_relationship_extraction(
record_attributes: list[str], record_attributes: list[str],
@@ -370,24 +383,31 @@ async def _handle_single_relationship_extraction(
): ):
if len(record_attributes) < 5 or '"relationship"' not in record_attributes[0]: if len(record_attributes) < 5 or '"relationship"' not in record_attributes[0]:
return None return None
# add this record as edge
source = clean_str(record_attributes[1])
target = clean_str(record_attributes[2])
# Normalize source and target entity names try:
# Process source and target entities with strict cleaning pipeline
# Step 1: Strict UTF-8 encoding sanitization (fail-fast approach)
source = sanitize_text_for_encoding(record_attributes[1])
# Step 2: HTML and control character cleaning
source = clean_str(source)
# Step 3: Business logic normalization
source = normalize_extracted_info(source, is_entity=True) source = normalize_extracted_info(source, is_entity=True)
# Same pipeline for target entity
target = sanitize_text_for_encoding(record_attributes[2])
target = clean_str(target)
target = normalize_extracted_info(target, is_entity=True) target = normalize_extracted_info(target, is_entity=True)
# Check if source or target became empty after normalization # Validate entity names after all cleaning steps
if not source or not source.strip(): if not source or not source.strip():
logger.warning( logger.warning(
f"Relationship extraction error: source entity became empty after normalization. Original: '{record_attributes[1]}'" f"Relationship extraction error: source entity became empty after cleaning. Original: '{record_attributes[1]}'"
) )
return None return None
if not target or not target.strip(): if not target or not target.strip():
logger.warning( logger.warning(
f"Relationship extraction error: target entity became empty after normalization. Original: '{record_attributes[2]}'" f"Relationship extraction error: target entity became empty after cleaning. Original: '{record_attributes[2]}'"
) )
return None return None
@@ -397,12 +417,15 @@ async def _handle_single_relationship_extraction(
) )
return None return None
edge_description = clean_str(record_attributes[3]) # Process relationship description with same cleaning pipeline
edge_description = sanitize_text_for_encoding(record_attributes[3])
edge_description = clean_str(edge_description)
edge_description = normalize_extracted_info(edge_description) edge_description = normalize_extracted_info(edge_description)
edge_keywords = normalize_extracted_info( # Process keywords with same cleaning pipeline
clean_str(record_attributes[4]), is_entity=True edge_keywords = sanitize_text_for_encoding(record_attributes[4])
) edge_keywords = clean_str(edge_keywords)
edge_keywords = normalize_extracted_info(edge_keywords, is_entity=True)
edge_keywords = edge_keywords.replace("，", ",")
edge_source_id = chunk_key edge_source_id = chunk_key
@@ -411,6 +434,7 @@ async def _handle_single_relationship_extraction(
if is_float_regex(record_attributes[-1].strip('"').strip("'")) if is_float_regex(record_attributes[-1].strip('"').strip("'"))
else 1.0 else 1.0
) )
return dict( return dict(
src_id=source, src_id=source,
tgt_id=target, tgt_id=target,
@@ -421,6 +445,17 @@ async def _handle_single_relationship_extraction(
file_path=file_path, file_path=file_path,
) )
except ValueError as e:
logger.error(
f"Relationship extraction failed due to encoding issues in chunk {chunk_key}: {e}"
)
return None
except Exception as e:
logger.error(
f"Relationship extraction failed with unexpected error in chunk {chunk_key}: {e}"
)
return None
async def _rebuild_knowledge_from_chunks( async def _rebuild_knowledge_from_chunks(
entities_to_rebuild: dict[str, set[str]], entities_to_rebuild: dict[str, set[str]],

View File

@@ -1577,7 +1577,7 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
"""Sanitize text to ensure safe UTF-8 encoding by removing or replacing problematic characters. """Sanitize text to ensure safe UTF-8 encoding by removing or replacing problematic characters.
This function handles: This function handles:
- Surrogate characters (the main cause of the encoding error) - Surrogate characters (the main cause of encoding errors)
- Other invalid Unicode sequences - Other invalid Unicode sequences
- Control characters that might cause issues - Control characters that might cause issues
- Whitespace trimming - Whitespace trimming
@@ -1588,6 +1588,9 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
Returns: Returns:
Sanitized text that can be safely encoded as UTF-8 Sanitized text that can be safely encoded as UTF-8
Raises:
ValueError: When text contains uncleanable encoding issues that cannot be safely processed
""" """
if not isinstance(text, str): if not isinstance(text, str):
return str(text) return str(text)
@@ -1636,34 +1639,21 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
return sanitized return sanitized
except UnicodeEncodeError as e: except UnicodeEncodeError as e:
logger.warning( # Critical change: Don't return placeholder, raise exception for caller to handle
f"Text sanitization: UnicodeEncodeError encountered, applying aggressive cleaning: {str(e)[:100]}" error_msg = f"Text contains uncleanable UTF-8 encoding issues: {str(e)[:100]}"
) logger.error(f"Text sanitization failed: {error_msg}")
raise ValueError(error_msg) from e
# Aggressive fallback: encode with error handling
try:
# Use 'replace' error handling to substitute problematic characters
safe_bytes = text.encode("utf-8", errors="replace")
sanitized = safe_bytes.decode("utf-8")
# Additional cleanup
sanitized = re.sub(
r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", replacement_char, sanitized
)
return sanitized
except Exception as fallback_error:
logger.error(
f"Text sanitization: Aggressive fallback failed: {str(fallback_error)}"
)
# Last resort: return a safe placeholder
return f"[TEXT_ENCODING_ERROR: {len(text)} characters]"
except Exception as e: except Exception as e:
logger.error(f"Text sanitization: Unexpected error: {str(e)}") logger.error(f"Text sanitization: Unexpected error: {str(e)}")
# Return original text if no encoding issues detected # For other exceptions, if no encoding issues detected, return original text
try:
text.encode("utf-8")
return text return text
except UnicodeEncodeError:
raise ValueError(
f"Text sanitization failed with unexpected error: {str(e)}"
) from e
def check_storage_env_vars(storage_name: str) -> None: def check_storage_env_vars(storage_name: str) -> None: