feat(ingest/excel): Add Excel Source

This commit is contained in:
Michael Minichino 2025-04-19 14:39:05 -05:00
parent 71b05c6020
commit d5eb2e7d6a
11 changed files with 44 additions and 1 deletions

View File

@ -198,7 +198,7 @@ class ExcelFile:
row_count,
column_count,
metadata,
sheet.title,
sheet.title.strip(),
)
def find_header_row(self, rows: List[List[Any]]) -> int:

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,43 @@
import io
from datahub.ingestion.source.excel.excel_file import ExcelFile
from datahub.ingestion.source.excel.report import ExcelSourceReport
def test_sample_files(pytestconfig):
    """Check the table geometry reported for each bundled sample workbook.

    Each case names a workbook under ``tests/unit/excel/data`` and a sheet
    inside it, together with the ``header_row``, ``footer_row``,
    ``row_count`` and ``column_count`` values expected on the table returned
    by ``ExcelFile.get_table`` for that sheet.
    """
    # Columns: workbook, sheet, header_row, footer_row, row_count, column_count
    expected_tables = [
        ("file_1.xlsx", "Monthly Reporting", 1, 5, 4, 17),
        ("file_1.xlsx", "Dec", 1, 4, 3, 14),
        ("file_1.xlsx", "Jan", 1, 5, 4, 14),
        ("file_1.xlsx", "Feb", 1, 5, 4, 14),
        ("file_2.xlsx", "Test Group Reporting ", 1, 19, 18, 46),
        ("file_3.xlsx", "Sheet1", 1, 5, 4, 209),
        ("file_4.xlsx", "in", 1, 3, 2, 252),
        ("file_5.xlsx", "Test1_Test", 4, 8, 4, 24),
        ("file_6.xlsx", "Test2_Test", 2, 6, 4, 24),
        ("file_7.xlsx", "12345678 (Current Month)", 1, 4, 3, 68),
        ("file_8.xlsx", "Test3_Test", 4, 8, 4, 24),
        ("file_9.xlsx", "Business Report", 6, 11, 5, 5),
    ]
    resources_root = pytestconfig.rootpath / "tests/unit/excel"

    for case in expected_tables:
        (
            workbook_name,
            sheet_name,
            want_header,
            want_footer,
            want_rows,
            want_columns,
        ) = case

        workbook_path = resources_root / f"data/{workbook_name}"
        report = ExcelSourceReport()
        assert workbook_path.exists()

        # Load the whole workbook into memory so ExcelFile reads from a
        # seekable in-memory stream rather than the open file handle.
        with open(workbook_path, "rb") as handle:
            stream = io.BytesIO(handle.read())

        excel = ExcelFile(workbook_name, stream, report)
        loaded = excel.load_workbook()
        assert loaded is True

        table = excel.get_table(sheet_name)
        assert table.header_row == want_header
        assert table.footer_row == want_footer
        assert table.row_count == want_rows
        assert table.column_count == want_columns