
Commit c2a75de

Fix Parquet Reader to use boto3 client (#27361)
* Fix Parquet Reader to use boto3 client
* Fix Avro/csv to use boto3 client
* Address comments
* Fix S3 tests
* Fix failing test
* address gitar comments
* Address co-pilot comments
1 parent bb64349 commit c2a75de

22 files changed

Lines changed: 770 additions & 253 deletions

ingestion/src/metadata/ingestion/source/database/datalake/clients/azure_blob.py

Lines changed: 3 additions & 3 deletions
@@ -13,7 +13,7 @@
 Datalake Azure Blob Client
 """
 from functools import partial
-from typing import Callable, Iterable, Optional, Set
+from typing import Callable, Iterable, Optional, Set, Tuple

 from azure.storage.blob import BlobServiceClient

@@ -62,7 +62,7 @@ def get_table_names(
         bucket_name: str,
         prefix: Optional[str],
         skip_cold_storage: bool = False,
-    ) -> Iterable[str]:
+    ) -> Iterable[Tuple[str, Optional[int]]]:
         container_client = self._client.get_container_client(bucket_name)

         for file in container_client.list_blobs(name_starts_with=prefix or None):
@@ -74,7 +74,7 @@ def get_table_names(
                     f"(blob_tier: {blob_tier})"
                 )
                 continue
-            yield file.name
+            yield file.name, getattr(file, "size", None)

     def close(self, service_connection):
         self._client.close()

ingestion/src/metadata/ingestion/source/database/datalake/clients/base.py

Lines changed: 9 additions & 4 deletions
@@ -13,19 +13,24 @@
 Datalake Base Client
 """
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Iterable, Optional
+from typing import Any, Callable, Iterable, Optional, Tuple


 class DatalakeBaseClient(ABC):
     """Base DL client implementation"""

-    def __init__(self, client: Any, **kwargs):
+    def __init__(self, client: Any, session: Any = None, **kwargs):
         self._client = client
+        self._session = session

     @property
     def client(self) -> Any:
         return self._client

+    @property
+    def session(self) -> Any:
+        return self._session
+
     @classmethod
     @abstractmethod
     def from_config(cls, config) -> "DatalakeBaseClient":
@@ -49,8 +54,8 @@ def get_table_names(
         bucket_name: str,
         prefix: Optional[str],
         skip_cold_storage: bool = False,
-    ) -> Iterable[str]:
-        """Returns the Table names, based on the underlying client."""
+    ) -> Iterable[Tuple[str, Optional[int]]]:
+        """Returns (key, file_size_bytes) tuples. Size may be None if unavailable."""

     @abstractmethod
     def close(self, service_connection):
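
For illustration, a minimal sketch of how a caller consumes the new contract: every concrete client now yields (key, size) pairs, where the size may be None when the listing does not expose it. The helper below is hypothetical and not part of this change.

from typing import Optional


def list_tables_with_sizes(client, bucket_name: str, prefix: Optional[str] = None):
    # `client` is assumed to be any DatalakeBaseClient subclass after this change.
    for key, size in client.get_table_names(bucket_name, prefix):
        # Size may legitimately be None; only convert when it is known.
        size_mb = size / (1024 * 1024) if size is not None else None
        print(key, "unknown size" if size_mb is None else f"{size_mb:.2f} MB")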

ingestion/src/metadata/ingestion/source/database/datalake/clients/gcs.py

Lines changed: 3 additions & 3 deletions
@@ -15,7 +15,7 @@
 import os
 from copy import deepcopy
 from functools import partial
-from typing import Callable, Iterable, List, Optional, Set
+from typing import Callable, Iterable, List, Optional, Set, Tuple

 from google.cloud import storage

@@ -117,7 +117,7 @@ def get_table_names(
         bucket_name: str,
         prefix: Optional[str],
         skip_cold_storage: bool = False,
-    ) -> Iterable[str]:
+    ) -> Iterable[Tuple[str, Optional[int]]]:
         bucket = self._client.get_bucket(bucket_name)

         for key in bucket.list_blobs(prefix=prefix):
@@ -129,7 +129,7 @@ def get_table_names(
                     f"(storage_class: {storage_class})"
                 )
                 continue
-            yield key.name
+            yield key.name, key.size

     def close(self, service_connection):
         os.environ.pop("GOOGLE_CLOUD_PROJECT", "")

ingestion/src/metadata/ingestion/source/database/datalake/clients/s3.py

Lines changed: 13 additions & 5 deletions
@@ -13,7 +13,7 @@
 Datalake S3 Client
 """
 from functools import partial
-from typing import Callable, Iterable, Optional, Set
+from typing import Callable, Iterable, Optional, Set, Tuple

 from metadata.clients.aws_client import AWSClient
 from metadata.generated.schema.entity.services.connections.database.datalake.s3Config import (
@@ -35,8 +35,16 @@ def from_config(cls, config: S3Config) -> "DatalakeS3Client":
         if not config.securityConfig:
             raise RuntimeError("S3Config securityConfig can't be None.")

-        s3_client = AWSClient(config.securityConfig).get_client(service_name="s3")
-        return cls(client=s3_client)
+        aws_client = AWSClient(config.securityConfig)
+        session = aws_client.create_session()
+        if config.securityConfig.endPointURL:
+            s3_client = session.client(
+                service_name="s3",
+                endpoint_url=str(config.securityConfig.endPointURL),
+            )
+        else:
+            s3_client = session.client(service_name="s3")
+        return cls(client=s3_client, session=session)

     def update_client_database(self, config, database_name: str):
         # For the S3 Client we don't need to do anything when changing the database
@@ -57,7 +65,7 @@ def get_table_names(
         bucket_name: str,
         prefix: Optional[str],
         skip_cold_storage: bool = False,
-    ) -> Iterable[str]:
+    ) -> Iterable[Tuple[str, Optional[int]]]:
         kwargs = {"Bucket": bucket_name}

         if prefix:
@@ -76,7 +84,7 @@ def get_table_names(
                     f"(StorageClass: {storage_class}, ArchiveStatus: {archive_status})"
                 )
                 continue
-            yield key["Key"]
+            yield key["Key"], key.get("Size")

     def get_folders_prefix(
         self, bucket_name: str, prefix: Optional[str]
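
A minimal sketch of the session/client split that from_config now performs, runnable outside the connector; the credentials, endpoint URL, bucket, and prefix below are placeholders, and the endpoint override is only needed for S3-compatible stores such as MinIO.

import boto3

session = boto3.Session(
    aws_access_key_id="EXAMPLE_KEY",          # placeholder credentials
    aws_secret_access_key="EXAMPLE_SECRET",
    region_name="us-east-1",
)

# Pass endpoint_url only when an override is configured (e.g. MinIO);
# otherwise the default AWS endpoint is used.
s3_client = session.client(service_name="s3", endpoint_url="http://localhost:9000")

# The listing already returns each object's size, which the connector now
# carries downstream instead of issuing a separate HEAD request per file.
paginator = s3_client.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket="my-bucket", Prefix="data/"):
    for obj in page.get("Contents", []):
        print(obj["Key"], obj.get("Size"))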

ingestion/src/metadata/ingestion/source/database/datalake/metadata.py

Lines changed: 8 additions & 6 deletions
@@ -226,7 +226,7 @@ def yield_database_schema(

     def get_tables_name_and_type(  # pylint: disable=too-many-branches
         self,
-    ) -> Iterable[Tuple[str, TableType, SupportedTypes]]:
+    ) -> Iterable[Tuple[str, TableType, SupportedTypes, Optional[int]]]:
         """
         Handle table and views.

@@ -251,7 +251,7 @@ def get_tables_name_and_type(  # pylint: disable=too-many-branches
             skip_cold_storage = (
                 getattr(self.service_connection, "skipColdStorage", False) or False
             )
-            for key_name in self.client.get_table_names(
+            for key_name, file_size in self.client.get_table_names(
                 bucket_name, prefix, skip_cold_storage=skip_cold_storage
             ):
                 table_name = self.standardize_table_name(bucket_name, key_name)
@@ -269,29 +269,31 @@ def get_tables_name_and_type(  # pylint: disable=too-many-branches
                     )
                     continue

-                yield table_name, TableType.Regular, file_extension
+                yield table_name, TableType.Regular, file_extension, file_size

     def yield_table(
-        self, table_name_and_type: Tuple[str, TableType, SupportedTypes]
+        self, table_name_and_type: Tuple[str, TableType, SupportedTypes, Optional[int]]
     ) -> Iterable[Either[CreateTableRequest]]:
         """
         From topology.
         Prepare a table request and pass it to the sink.
         Uses first chunk only for schema inference to avoid loading entire file.
         """
-        table_name, table_type, table_extension = table_name_and_type
+        table_name, table_type, table_extension, file_size = table_name_and_type
         schema_name = self.context.get().database_schema
         try:
             table_constraints = None
             data_frame, raw_data = fetch_dataframe_first_chunk(
                 config_source=self.config_source,
-                client=self.client._client,
+                client=self.client.client,
                 file_fqn=DatalakeTableSchemaWrapper(
                     key=table_name,
                     bucket_name=schema_name,
                     file_extension=table_extension,
+                    file_size=file_size,
                 ),
                 fetch_raw_data=True,
+                session=getattr(self.client, "session", None),
             )
             if data_frame:
                 data_frame = next(data_frame)
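
To make the widened topology tuple concrete, a small sketch of the 4-element shape now passed from get_tables_name_and_type to yield_table; the sample values are invented and the plain strings stand in for the TableType/SupportedTypes enums.

from typing import Optional, Tuple

# (table name, table type, file extension, file size in bytes or None)
entry: Tuple[str, str, str, Optional[int]] = (
    "transactions.parquet", "Regular", "parquet", 52_428_800
)

table_name, table_type, table_extension, file_size = entry
# file_size rides along so the reader can skip a HEAD/info call later;
# it may be None when the listing did not report a size.
print(table_name, file_size)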

ingestion/src/metadata/readers/dataframe/avro.py

Lines changed: 9 additions & 11 deletions
@@ -33,7 +33,6 @@
 from metadata.readers.dataframe.base import DataFrameReader, FileFormatException
 from metadata.readers.dataframe.models import DatalakeColumnWrapper
 from metadata.readers.file.adls import return_azure_storage_options
-from metadata.readers.file.s3 import return_s3_storage_options
 from metadata.readers.models import ConfigSource
 from metadata.utils.constants import CHUNKSIZE
 from metadata.utils.logger import ingestion_logger
@@ -109,19 +108,18 @@ def _read_avro_dispatch(
     @_read_avro_dispatch.register
     def _(self, _: S3Config, key: str, bucket_name: str) -> DatalakeColumnWrapper:
         """Stream Avro from S3 without loading entire file into memory."""
-        from s3fs import S3FileSystem
-
-        storage_options = return_s3_storage_options(self.config_source)
-        s3 = S3FileSystem(**storage_options)
-        file_path = f"s3://{bucket_name}/{key}"
-
-        with s3.open(file_path, "rb") as f:
-            columns = self._get_avro_columns(f)
+        schema_response = self.client.get_object(Bucket=bucket_name, Key=key)
+        try:
+            columns = self._get_avro_columns(schema_response["Body"])
+        finally:
+            schema_response["Body"].close()

         def chunk_generator():
             response = self.client.get_object(Bucket=bucket_name, Key=key)
-            file_stream = response["Body"]
-            yield from self._stream_avro_records(file_stream)
+            try:
+                yield from self._stream_avro_records(response["Body"])
+            finally:
+                response["Body"].close()

         return DatalakeColumnWrapper(
             columns=columns,
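
A standalone sketch of the same streaming pattern using fastavro over a boto3 get_object body (the connector's own _stream_avro_records helper is not shown in this diff, so fastavro here is an assumption); bucket and key are placeholders.

import boto3
from fastavro import reader as avro_reader

s3 = boto3.client("s3")
response = s3.get_object(Bucket="my-bucket", Key="data/events.avro")
try:
    # fastavro reads from any file-like object exposing read(), so the
    # StreamingBody is consumed record by record without buffering the file.
    for record in avro_reader(response["Body"]):
        print(record)
finally:
    response["Body"].close()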

ingestion/src/metadata/readers/dataframe/base.py

Lines changed: 21 additions & 20 deletions
@@ -64,43 +64,44 @@ class DataFrameReader(ABC):
     config_source: ConfigSource
     reader: Reader

-    def __init__(self, config_source: ConfigSource, client: Optional[Any]):
+    def __init__(
+        self,
+        config_source: ConfigSource,
+        client: Optional[Any],
+        session: Optional[Any] = None,
+    ):
         self.config_source = config_source
         self.client = client
+        self.session = session

         self.reader = get_reader(config_source=config_source, client=client)

-    def _get_file_size_mb(self, key: str, bucket_name: str) -> float:
+    def _get_file_size_mb(
+        self, key: str, bucket_name: str, file_size: Optional[int] = None
+    ) -> float:
         """
         Get file size in MB. Returns 0 if unable to determine.
-        Uses efficient HEAD operations from cloud providers.
+        If file_size (bytes) is provided from listing metadata, uses that
+        to avoid a redundant HEAD/info API call.
         """
+        if file_size is not None:
+            return file_size / (1024 * 1024)
         try:
             if isinstance(self.config_source, S3Config):
                 response = self.client.head_object(Bucket=bucket_name, Key=key)
                 return response.get("ContentLength", 0) / (1024 * 1024)

             elif isinstance(self.config_source, GCSConfig):
-                from gcsfs import GCSFileSystem
-
-                gcs = GCSFileSystem()
-                file_path = f"gs://{bucket_name}/{key}"
-                file_info = gcs.info(file_path)
-                return file_info.get("size", 0) / (1024 * 1024)
+                bucket = self.client.get_bucket(bucket_name)
+                blob = bucket.get_blob(key)
+                return (blob.size or 0) / (1024 * 1024) if blob else 0

             elif isinstance(self.config_source, AzureConfig):
-                from adlfs import AzureBlobFileSystem
-
-                from metadata.readers.file.adls import return_azure_storage_options
-
-                storage_options = return_azure_storage_options(self.config_source)
-                adlfs_fs = AzureBlobFileSystem(
-                    account_name=self.config_source.securityConfig.accountName,
-                    **storage_options,
+                blob_client = self.client.get_blob_client(
+                    container=bucket_name, blob=key
                 )
-                file_path = f"{bucket_name}/{key}"
-                file_info = adlfs_fs.info(file_path)
-                return file_info.get("size", 0) / (1024 * 1024)
+                props = blob_client.get_blob_properties()
+                return (props.size or 0) / (1024 * 1024)

             elif isinstance(self.config_source, LocalConfig):
                 import os
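
A standalone sketch of the "prefer the listing size" logic for the S3 branch above; the function name and arguments are illustrative, not the connector's actual API.

from typing import Optional


def file_size_mb(s3_client, bucket: str, key: str, listed_size: Optional[int] = None) -> float:
    if listed_size is not None:
        # Size already came back with list_objects_v2: no extra API call needed.
        return listed_size / (1024 * 1024)
    # Fallback: one HEAD request for this object only.
    response = s3_client.head_object(Bucket=bucket, Key=key)
    return response.get("ContentLength", 0) / (1024 * 1024)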

ingestion/src/metadata/readers/dataframe/dsv.py

Lines changed: 26 additions & 6 deletions
@@ -34,7 +34,6 @@
 from metadata.readers.dataframe.base import DataFrameReader, FileFormatException
 from metadata.readers.dataframe.models import DatalakeColumnWrapper
 from metadata.readers.file.adls import AZURE_PATH, return_azure_storage_options
-from metadata.readers.file.s3 import return_s3_storage_options
 from metadata.readers.models import ConfigSource
 from metadata.utils.constants import CHUNKSIZE
 from metadata.utils.logger import ingestion_logger
@@ -116,9 +115,10 @@ def __init__(
         config_source: ConfigSource,
         client: Optional[Any],
         separator: str = CSV_SEPARATOR,
+        session: Optional[Any] = None,
     ):
         self.separator = separator
-        super().__init__(config_source, client)
+        super().__init__(config_source, client, session=session)

     def read_from_pandas(
         self,
@@ -173,12 +173,32 @@ def _(self, _: GCSConfig, key: str, bucket_name: str) -> DatalakeColumnWrapper:

     @_read_dsv_dispatch.register
     def _(self, _: S3Config, key: str, bucket_name: str) -> DatalakeColumnWrapper:
+        import pandas as pd  # pylint: disable=import-outside-toplevel
+
         compression = "gzip" if key.endswith(".gz") else None

-        storage_options = return_s3_storage_options(self.config_source)
-        path = f"s3://{bucket_name}/{key}"
-        return self.read_from_pandas(
-            path=path, storage_options=storage_options, compression=compression
+        def chunk_generator():
+            response = self.client.get_object(Bucket=bucket_name, Key=key)
+            try:
+                with pd.read_csv(
+                    response["Body"],
+                    sep=self.separator,
+                    chunksize=CHUNKSIZE,
+                    compression=compression,
+                    encoding_errors="ignore",
+                    escapechar="\\",
+                ) as reader:
+                    for chunks in reader:
+                        fixed = self._fix_malformed_quoted_chunk(
+                            chunk_list=[chunks], separator=self.separator
+                        )
+                        if fixed:
+                            yield fixed[0]
+            finally:
+                response["Body"].close()
+
+        return DatalakeColumnWrapper(
+            dataframes=chunk_generator, columns=None, raw_data=None
         )

     @_read_dsv_dispatch.register
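
The same chunked-read pattern, sketched standalone: pandas accepts the boto3 StreamingBody directly, so no s3fs mount is needed; bucket, key, and chunk size here are placeholders.

import boto3
import pandas as pd

s3 = boto3.client("s3")
response = s3.get_object(Bucket="my-bucket", Key="data/sales.csv")
try:
    # chunksize keeps memory bounded: each iteration yields a DataFrame of
    # at most 10_000 rows read straight from the streaming body.
    with pd.read_csv(response["Body"], sep=",", chunksize=10_000) as reader:
        for chunk in reader:
            print(len(chunk), list(chunk.columns))
finally:
    response["Body"].close()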

ingestion/src/metadata/readers/dataframe/json.py

Lines changed: 13 additions & 4 deletions
@@ -158,7 +158,11 @@ def _is_json_lines(file_obj) -> bool:
         return False

     def _read_json_smart(
-        self, file_obj_getter, key: str, bucket_name: str
+        self,
+        file_obj_getter,
+        key: str,
+        bucket_name: str,
+        file_size: Optional[int] = None,
     ) -> DatalakeColumnWrapper:
         """
         Smart JSON reading with automatic format detection and streaming.
@@ -179,7 +183,7 @@ def chunk_generator():
                 dataframes=chunk_generator, raw_data=None, columns=None
             )

-        file_size_mb = self._get_file_size_mb(key, bucket_name)
+        file_size_mb = self._get_file_size_mb(key, bucket_name, file_size=file_size)
         if file_size_mb > (MAX_FILE_SIZE_FOR_PREVIEW / (1024 * 1024)):
             logger.info(
                 f"Large JSON file ({file_size_mb:.2f} MB). Streaming with ijson."
@@ -223,7 +227,9 @@ def get_stream():
             finally:
                 response["Body"].close()

-        return self._read_json_smart(get_stream, key, bucket_name)
+        return self._read_json_smart(
+            get_stream, key, bucket_name, file_size=self._file_size
+        )

     @_read_json_dispatch.register
     def _(self, _: GCSConfig, key: str, bucket_name: str) -> DatalakeColumnWrapper:
@@ -271,7 +277,10 @@ def get_stream():

         return self._read_json_smart(get_stream, key, bucket_name)

-    def _read(self, *, key: str, bucket_name: str, **__) -> DatalakeColumnWrapper:
+    def _read(
+        self, *, key: str, bucket_name: str, file_size: Optional[int] = None, **__
+    ) -> DatalakeColumnWrapper:
+        self._file_size = file_size
         return self._read_json_dispatch(
             self.config_source, key=key, bucket_name=bucket_name
         )
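
A sketch of the size-gated choice the JSON reader makes, with a placeholder 20 MB threshold standing in for MAX_FILE_SIZE_FOR_PREVIEW: ijson streams items of the top-level array for large files, while small or unknown-size files are parsed with a single json.load.

import json
from typing import Optional

import ijson

SIZE_THRESHOLD_BYTES = 20 * 1024 * 1024  # placeholder threshold


def iter_json_records(file_obj, file_size: Optional[int]):
    if file_size is not None and file_size > SIZE_THRESHOLD_BYTES:
        # Large file: stream elements of the root array without loading it all.
        yield from ijson.items(file_obj, "item")
    else:
        # Small or unknown-size file: a single parse is simpler.
        data = json.load(file_obj)
        yield from (data if isinstance(data, list) else [data])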
