mirror of
https://github.com/microsoft/markitdown.git
synced 2025-06-26 22:00:21 +00:00
* feat: Add CSV to Markdown table converter
- Add new CsvConverter class to convert CSV files to Markdown tables
- Support text/csv and application/csv MIME types
- Preserve table structure with headers and data rows
- Handle edge cases like empty cells and mismatched columns
- Fix Azure Document Intelligence dependency handling
- Register CsvConverter in MarkItDown class
----
Thanks also to @benny123tw who submitted a very similar PR in #1171
This commit is contained in:
parent
3fcd48cdfc
commit
8576f1d915
@ -41,6 +41,7 @@ from .converters import (
|
||||
ZipConverter,
|
||||
EpubConverter,
|
||||
DocumentIntelligenceConverter,
|
||||
CsvConverter,
|
||||
)
|
||||
|
||||
from ._base_converter import DocumentConverter, DocumentConverterResult
|
||||
@ -194,6 +195,7 @@ class MarkItDown:
|
||||
self.register_converter(PdfConverter())
|
||||
self.register_converter(OutlookMsgConverter())
|
||||
self.register_converter(EpubConverter())
|
||||
self.register_converter(CsvConverter())
|
||||
|
||||
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
||||
docintel_endpoint = kwargs.get("docintel_endpoint")
|
||||
|
@ -22,6 +22,7 @@ from ._doc_intel_converter import (
|
||||
DocumentIntelligenceFileType,
|
||||
)
|
||||
from ._epub_converter import EpubConverter
|
||||
from ._csv_converter import CsvConverter
|
||||
|
||||
__all__ = [
|
||||
"PlainTextConverter",
|
||||
@ -43,4 +44,5 @@ __all__ = [
|
||||
"DocumentIntelligenceConverter",
|
||||
"DocumentIntelligenceFileType",
|
||||
"EpubConverter",
|
||||
"CsvConverter",
|
||||
]
|
||||
|
@ -0,0 +1,79 @@
|
||||
import sys
|
||||
import csv
|
||||
import io
|
||||
from typing import BinaryIO, Any
|
||||
from charset_normalizer import from_bytes
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/csv",
    "application/csv",
]
ACCEPTED_FILE_EXTENSIONS = [".csv"]


class CsvConverter(DocumentConverter):
    """
    Converts CSV files to Markdown tables.

    The first CSV row is rendered as the table header and every following
    row as a data row. Data rows are padded with empty cells or truncated
    so that they always have as many columns as the header.
    """

    def __init__(self):
        super().__init__()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream looks like CSV.

        A stream is accepted if its extension is listed in
        ACCEPTED_FILE_EXTENSIONS or its MIME type starts with one of
        ACCEPTED_MIME_TYPE_PREFIXES (both compared case-insensitively).
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        # str.startswith accepts a tuple, so one call covers every prefix.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Read CSV data from ``file_stream`` and render it as a Markdown table.

        The bytes are decoded with ``stream_info.charset`` when it is set;
        otherwise the encoding is guessed with charset_normalizer. An input
        without any rows produces an empty Markdown document.

        Raises:
            UnicodeDecodeError / LookupError: if ``stream_info.charset`` is
                set but cannot decode the data or names an unknown codec.
        """
        raw = file_stream.read()
        if stream_info.charset:
            content = raw.decode(stream_info.charset)
        else:
            best_match = from_bytes(raw).best()
            if best_match is not None:
                content = str(best_match)
            else:
                # best() returns None when no encoding fits; str(None) would
                # emit the literal text "None" as the table content, so fall
                # back to a lossy UTF-8 decode instead.
                content = raw.decode("utf-8", errors="replace")

        # Parse CSV content
        rows = list(csv.reader(io.StringIO(content)))
        if not rows:
            return DocumentConverterResult(markdown="")

        header = rows[0]
        width = len(header)

        # Header row followed by the separator row
        markdown_table = [
            self._format_row(header),
            "| " + " | ".join(["---"] * width) + " |",
        ]

        # Data rows: truncate extra columns, pad missing ones. A negative
        # repeat count yields an empty list, so long rows get no padding.
        for row in rows[1:]:
            normalized = row[:width] + [""] * (width - len(row))
            markdown_table.append(self._format_row(normalized))

        return DocumentConverterResult(markdown="\n".join(markdown_table))

    @staticmethod
    def _escape_cell(cell: str) -> str:
        """Make one CSV field safe to place inside a Markdown table cell."""
        # An unescaped pipe would be read as a column delimiter, and a raw
        # line break would end the table row early.
        return (
            cell.replace("|", "\\|")
            .replace("\r\n", "<br>")
            .replace("\n", "<br>")
            .replace("\r", "<br>")
        )

    @classmethod
    def _format_row(cls, cells: list) -> str:
        """Render a list of cell strings as one Markdown table row."""
        return "| " + " | ".join(cls._escape_cell(cell) for cell in cells) + " |"
|
@ -1,8 +1,7 @@
|
||||
import sys
|
||||
import re
|
||||
import os
|
||||
|
||||
from typing import BinaryIO, Any, List
|
||||
from typing import BinaryIO, Any, List, Optional, Union
|
||||
from enum import Enum
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
@ -26,6 +25,28 @@ except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
# Fallback definitions: these empty classes keep the names below resolvable
# for type hints when the optional dependency import has failed.
class AzureKeyCredential:
    """Empty stand-in so this name resolves in type hints without the package."""


class TokenCredential:
    """Empty stand-in so this name resolves in type hints without the package."""


class DocumentIntelligenceClient:
    """Empty stand-in so this name resolves in type hints without the package."""


class AnalyzeDocumentRequest:
    """Empty stand-in so this name resolves in type hints without the package."""


class AnalyzeResult:
    """Empty stand-in so this name resolves in type hints without the package."""


class DocumentAnalysisFeature:
    """Empty stand-in so this name resolves in type hints without the package."""


class DefaultAzureCredential:
    """Empty stand-in so this name resolves in type hints without the package."""
|
||||
|
||||
|
||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||
# This constant is a temporary fix until the bug is resolved.
|
||||
|
@ -144,10 +144,11 @@ GENERAL_TEST_VECTORS = [
|
||||
charset="cp932",
|
||||
url=None,
|
||||
must_include=[
|
||||
"名前,年齢,住所",
|
||||
"佐藤太郎,30,東京",
|
||||
"三木英子,25,大阪",
|
||||
"髙橋淳,35,名古屋",
|
||||
"| 名前 | 年齢 | 住所 |",
|
||||
"| --- | --- | --- |",
|
||||
"| 佐藤太郎 | 30 | 東京 |",
|
||||
"| 三木英子 | 25 | 大阪 |",
|
||||
"| 髙橋淳 | 35 | 名古屋 |",
|
||||
],
|
||||
must_not_include=[],
|
||||
),
|
||||
|
Loading…
x
Reference in New Issue
Block a user