Mirror of https://github.com/datahub-project/datahub.git (synced 2025-09-03 06:13:14 +00:00)

feat(ingest/datahub): use stream_results with mysql (#12278)

This commit is contained in:
  parent 30a77c022a
  commit a06a229499
@@ -461,7 +461,7 @@ plugins: Dict[str, Set[str]] = {
     "mssql-odbc": sql_common | mssql_common | {"pyodbc"},
     "mysql": mysql,
     # mariadb should have same dependency as mysql
-    "mariadb": sql_common | {"pymysql>=1.0.2"},
+    "mariadb": sql_common | mysql,
     "okta": {"okta~=1.7.0", "nest-asyncio"},
     "oracle": sql_common | {"oracledb"},
     "postgres": sql_common | postgres_common,

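To make the dependency change concrete, here is a minimal sketch of how these extras sets are presumably composed in setup.py; the contents of sql_common and mysql below are assumptions, only the mariadb entry reflects the diff.

from typing import Dict, Set

# Assumed stand-ins for the real dependency sets defined earlier in setup.py.
sql_common: Set[str] = {"sqlalchemy>=1.4.39, <2"}
mysql: Set[str] = sql_common | {"pymysql>=1.0.2"}

plugins: Dict[str, Set[str]] = {
    "mysql": mysql,
    # mariadb now reuses the mysql set, so both extras pull the same driver
    # instead of mariadb pinning pymysql on its own.
    "mariadb": sql_common | mysql,
}
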
@@ -1,6 +1,7 @@
 import os
 from typing import Optional, Set

+import pydantic
 from pydantic import Field, root_validator

 from datahub.configuration.common import AllowDenyPattern

@@ -119,3 +120,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
                 " Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
             )
         return values
+
+    @pydantic.validator("database_connection")
+    def validate_mysql_scheme(
+        cls, v: SQLAlchemyConnectionConfig
+    ) -> SQLAlchemyConnectionConfig:
+        if "mysql" in v.scheme:
+            if v.scheme != "mysql+pymysql":
+                raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
+        return v

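As a rough illustration of what the new validator enforces, the standalone sketch below reproduces the scheme check on a plain pydantic model so it can be run without the DataHub source config; the model and field names are placeholders, not the real SQLAlchemyConnectionConfig.

import pydantic


class ConnectionSketch(pydantic.BaseModel):
    # Placeholder for SQLAlchemyConnectionConfig; only the scheme field matters here.
    scheme: str

    @pydantic.validator("scheme")
    def validate_mysql_scheme(cls, v: str) -> str:
        # Mirror of the check added in the diff: any MySQL-flavoured scheme
        # must use the pymysql driver.
        if "mysql" in v and v != "mysql+pymysql":
            raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
        return v


ConnectionSketch(scheme="mysql+pymysql")        # accepted
ConnectionSketch(scheme="postgresql+psycopg2")  # accepted, not a MySQL scheme
# ConnectionSketch(scheme="mysql") would raise a ValidationError
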
@@ -151,8 +151,10 @@ class DataHubDatabaseReader:
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
-            if self.engine.dialect.name == "postgresql":
+            if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
                 with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                    # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
+                    # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,

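For reference, a minimal standalone sketch of the streaming pattern the reader now uses for all three dialects; the connection URL, query, and batch size are placeholders, while stream_results and yield_per are the SQLAlchemy 1.4 execution options linked in the comment above.

from sqlalchemy import create_engine, text

engine = create_engine("mysql+pymysql://user:pass@localhost/datahub")  # placeholder URL

with engine.connect() as conn:
    with conn.begin():  # transaction keeps the server-side cursor alive
        conn = conn.execution_options(stream_results=True, yield_per=2000)
        result = conn.execute(text("SELECT urn, version FROM metadata_aspect_v2"))  # placeholder query
        for row in result:
            # Rows are fetched from the server in yield_per-sized batches
            # instead of being loaded into memory all at once; dict(row)
            # follows the 1.4-style access pattern used in the diff.
            print(dict(row))
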
@@ -160,22 +162,6 @@ class DataHubDatabaseReader:
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
-            elif self.engine.dialect.name == "mysql":  # MySQL
-                import MySQLdb
-
-                with contextlib.closing(
-                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
-                ) as cursor:
-                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
-                    cursor.execute(query, params)
-
-                    columns = [desc[0] for desc in cursor.description]
-                    while True:
-                        rows = cursor.fetchmany(self.config.database_query_batch_size)
-                        if not rows:
-                            break  # Use break instead of return in generator
-                        for row in rows:
-                            yield dict(zip(columns, row))
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")

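The branch removed above streamed MySQL results by hand through a server-side DBAPI cursor; for contrast, here is a rough standalone sketch of that raw-cursor pattern, using pymysql's SSCursor as a stand-in for MySQLdb.cursors.SSCursor and placeholder connection details.

import contextlib

import pymysql
import pymysql.cursors

# Placeholder connection parameters.
conn = pymysql.connect(host="localhost", user="user", password="pass", database="datahub")

with contextlib.closing(conn.cursor(pymysql.cursors.SSCursor)) as cursor:
    cursor.execute("SELECT urn, version FROM metadata_aspect_v2")  # placeholder query
    columns = [desc[0] for desc in cursor.description]
    while True:
        rows = cursor.fetchmany(2000)  # stand-in for database_query_batch_size
        if not rows:
            break
        for row in rows:
            print(dict(zip(columns, row)))
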
@@ -130,7 +130,7 @@ class DataHubSource(StatefulIngestionSourceBase):
                 self._commit_progress(i)

     def _get_kafka_workunits(
-        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str] = []
+        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
    ) -> Iterable[MetadataWorkUnit]:
         if self.config.kafka_connection is None:
             return

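The signature change above also drops a mutable default argument; presumably the intent is to make the parameter explicit and avoid the classic shared-default pitfall, illustrated by this toy example.

def collect(item, bucket=[]):  # the default list is created once and shared across calls
    bucket.append(item)
    return bucket


print(collect("a"))  # ['a']
print(collect("b"))  # ['a', 'b'] -- state from the previous call leaks in


def collect_explicit(item, bucket):  # caller must pass the list, as in the new signature
    bucket.append(item)
    return bucket


print(collect_explicit("a", []))  # ['a']
print(collect_explicit("b", []))  # ['b']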