Skip to content

Commit cd0b259

Browse files
google-drive connector (#27197)
1 parent dd73bd9 commit cd0b259

10 files changed

Lines changed: 2312 additions & 2 deletions

File tree

ingestion/setup.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# https://github.com/open-metadata/OpenMetadata/actions/runs/15640676139/job/44066998708?pr=21719 Copyright 2025 Collate
1+
# Copyright 2025 Collate
22
# Licensed under the Collate Community License, Version 1.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -193,6 +193,9 @@
193193
"azuresql": {VERSIONS["pyodbc"]},
194194
"azure-sso": {VERSIONS["msal"]},
195195
"backup": {VERSIONS["boto3"], VERSIONS["azure-identity"], "azure-storage-blob"},
196+
"googledrive": {
197+
"google-api-python-client>=2.0.0",
198+
},
196199
"bigquery": {
197200
"google-cloud-datacatalog>=3.6.2",
198201
"google-cloud-logging",
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
source:
2+
type: googledrive
3+
serviceName: local_googledrive
4+
serviceConnection:
5+
config:
6+
type: GoogleDrive
7+
credentials:
8+
gcpConfig:
9+
type: service_account
10+
projectId: project_id
11+
privateKeyId: private_key_id
12+
privateKey: private_key
13+
clientEmail: gcpuser@project_id.iam.gserviceaccount.com
14+
clientId: client_id
15+
authUri: https://accounts.google.com/o/oauth2/auth
16+
tokenUri: https://oauth2.googleapis.com/token
17+
authProviderX509CertUrl: https://www.googleapis.com/oauth2/v1/certs
18+
clientX509CertUrl: https://www.googleapis.com/oauth2/v1/certs
19+
# includeTeamDrives: true
20+
# includeGoogleSheets: false
21+
sourceConfig:
22+
config:
23+
type: DriveMetadata
24+
sink:
25+
type: metadata-rest
26+
config: {}
27+
workflowConfig:
28+
# loggerLevel: INFO # DEBUG, INFO, WARN or ERROR
29+
openMetadataServerConfig:
30+
hostPort: http://localhost:8585/api
31+
authProvider: openmetadata
32+
securityConfig:
33+
jwtToken: "eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg"
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Copyright 2025 Collate
2+
# Licensed under the Collate Community License, Version 1.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
6+
# Unless required by applicable law or agreed to in writing, software
7+
# distributed under the License is distributed on an "AS IS" BASIS,
8+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
# See the License for the specific language governing permissions and
10+
# limitations under the License.
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
# Copyright 2025 Collate
2+
# Licensed under the Collate Community License, Version 1.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
6+
# Unless required by applicable law or agreed to in writing, software
7+
# distributed under the License is distributed on an "AS IS" BASIS,
8+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
# See the License for the specific language governing permissions and
10+
# limitations under the License.
11+
12+
"""
13+
Google Drive connection and helpers
14+
"""
15+
import traceback
16+
from functools import partial
17+
from typing import Optional
18+
19+
from google.auth import default
20+
from googleapiclient.discovery import Resource, build
21+
22+
from metadata.generated.schema.entity.automations.workflow import (
23+
Workflow as AutomationWorkflow,
24+
)
25+
from metadata.generated.schema.entity.services.connections.drive.googleDriveConnection import (
26+
GoogleDriveConnection,
27+
)
28+
from metadata.generated.schema.entity.services.connections.testConnectionResult import (
29+
TestConnectionResult,
30+
)
31+
from metadata.ingestion.connections.test_connections import (
32+
SourceConnectionException,
33+
test_connection_steps,
34+
)
35+
from metadata.ingestion.ometa.ometa_api import OpenMetadata
36+
from metadata.utils.constants import THREE_MIN
37+
from metadata.utils.credentials import set_google_credentials
38+
from metadata.utils.logger import ingestion_logger
39+
40+
logger = ingestion_logger()
41+
42+
43+
class GoogleDriveClient:
44+
"""
45+
Wrapper around Google Sheets and Drive API clients
46+
"""
47+
48+
def __init__(self, sheets_service: Resource, drive_service: Resource):
49+
self.sheets_service = sheets_service
50+
self.drive_service = drive_service
51+
52+
53+
def get_connection(connection: GoogleDriveConnection) -> GoogleDriveClient:
54+
"""
55+
Create connection to Google Drive
56+
"""
57+
scopes = (
58+
connection.scopes
59+
if hasattr(connection, "scopes") and connection.scopes
60+
else [
61+
"https://www.googleapis.com/auth/spreadsheets.readonly",
62+
"https://www.googleapis.com/auth/drive.readonly",
63+
"https://www.googleapis.com/auth/drive.metadata.readonly",
64+
]
65+
)
66+
67+
# Set Google credentials using the utility function
68+
set_google_credentials(gcp_credentials=connection.credentials)
69+
70+
# Get default credentials - this will use the credentials set by set_google_credentials
71+
credentials, _ = default(scopes=scopes)
72+
73+
# Handle impersonation if configured
74+
if (
75+
connection.credentials.gcpImpersonateServiceAccount
76+
and connection.credentials.gcpImpersonateServiceAccount.impersonateServiceAccount
77+
):
78+
from google.auth import ( # pylint: disable=import-outside-toplevel
79+
impersonated_credentials,
80+
)
81+
82+
credentials = impersonated_credentials.Credentials(
83+
source_credentials=credentials,
84+
target_principal=connection.credentials.gcpImpersonateServiceAccount.impersonateServiceAccount,
85+
target_scopes=scopes,
86+
lifetime=connection.credentials.gcpImpersonateServiceAccount.lifetime,
87+
)
88+
89+
# Build the services
90+
sheets_service = build("sheets", "v4", credentials=credentials)
91+
drive_service = build("drive", "v3", credentials=credentials)
92+
93+
return GoogleDriveClient(sheets_service, drive_service)
94+
95+
96+
def test_connection(
97+
metadata: OpenMetadata,
98+
client: GoogleDriveClient,
99+
service_connection: GoogleDriveConnection,
100+
automation_workflow: Optional[AutomationWorkflow] = None,
101+
timeout_seconds: Optional[int] = THREE_MIN,
102+
) -> TestConnectionResult:
103+
"""
104+
Test connection to Google Drive
105+
"""
106+
logger.info("Starting Google Drive test connection")
107+
108+
def check_access():
109+
"""
110+
Check if we can access Google Drive API
111+
"""
112+
try:
113+
# Try to get user info - this will fail if credentials are invalid
114+
about = client.drive_service.about().get(fields="user").execute()
115+
user_email = about.get("user", {}).get("emailAddress", "Unknown")
116+
logger.info(f"Successfully authenticated as: {user_email}")
117+
except Exception as exc:
118+
logger.debug(f"Access check error traceback: {traceback.format_exc()}")
119+
raise SourceConnectionException(f"Failed to access Google Drive API: {exc}")
120+
121+
def get_drive_files():
122+
"""
123+
Test listing drive files
124+
"""
125+
try:
126+
logger.info("Testing Google Drive file listing")
127+
128+
# Query for a small number of files to test access
129+
query = "trashed=false"
130+
131+
results = (
132+
client.drive_service.files()
133+
.list(
134+
q=query,
135+
pageSize=5,
136+
fields="files(id, name, mimeType)",
137+
supportsAllDrives=True,
138+
includeItemsFromAllDrives=True,
139+
)
140+
.execute()
141+
)
142+
143+
files = results.get("files", [])
144+
logger.info(f"Found {len(files)} files in Drive (sample)")
145+
146+
# Also test for shared drives
147+
logger.info("Testing shared drive access")
148+
try:
149+
shared_results = (
150+
client.drive_service.drives()
151+
.list(pageSize=5, fields="drives(id, name)")
152+
.execute()
153+
)
154+
shared_drives = shared_results.get("drives", [])
155+
logger.info(f"Found {len(shared_drives)} shared drives")
156+
for drive in shared_drives:
157+
logger.info(
158+
f"Shared drive: {drive.get('name')} (ID: {drive.get('id')})"
159+
)
160+
except Exception as shared_exc:
161+
logger.warning(f"Could not access shared drives: {shared_exc}")
162+
163+
except Exception as exc:
164+
logger.debug(f"Drive files test error traceback: {traceback.format_exc()}")
165+
raise SourceConnectionException(f"Failed to list drive files: {exc}")
166+
167+
def get_spreadsheets(include_sheets: bool = False):
168+
"""
169+
Test listing spreadsheets if Google Sheets is included
170+
"""
171+
if not include_sheets:
172+
return
173+
174+
try:
175+
logger.info("Testing Google Sheets spreadsheet listing")
176+
177+
# Query for Google Sheets files
178+
query = (
179+
"mimeType='application/vnd.google-apps.spreadsheet' and trashed=false"
180+
)
181+
182+
results = (
183+
client.drive_service.files()
184+
.list(q=query, pageSize=5, fields="files(id, name)")
185+
.execute()
186+
)
187+
188+
files = results.get("files", [])
189+
logger.info(f"Found {len(files)} spreadsheets")
190+
191+
except Exception as exc:
192+
logger.debug(f"Spreadsheet test error traceback: {traceback.format_exc()}")
193+
raise SourceConnectionException(f"Failed to list spreadsheets: {exc}")
194+
195+
test_fn = {
196+
"CheckAccess": check_access,
197+
"GetDriveFiles": get_drive_files,
198+
"GetSpreadsheets": partial(
199+
get_spreadsheets, include_sheets=service_connection.includeGoogleSheets
200+
),
201+
}
202+
203+
return test_connection_steps(
204+
metadata=metadata,
205+
test_fn=test_fn,
206+
service_type=service_connection.type.value,
207+
automation_workflow=automation_workflow,
208+
timeout_seconds=timeout_seconds,
209+
)

0 commit comments

Comments
 (0)