diff --git a/dtable_events/app/app.py b/dtable_events/app/app.py index c991af41..400ec465 100644 --- a/dtable_events/app/app.py +++ b/dtable_events/app/app.py @@ -23,6 +23,7 @@ from dtable_events.workflow.workflow_actions import WorkflowActionsHandler from dtable_events.workflow.workflow_schedules_scanner import WorkflowSchedulesScanner from dtable_events.convert_page.manager import conver_page_to_pdf_manager +from dtable_events.convert_page.process_monitor import browser_monitor from dtable_events.api_calls.api_calls_counter import APICallsCounter from dtable_events.tasks.dtable_file_access_log_cleaner import DTableFileAccessLogCleaner from dtable_events.activities.dtable_update_handler import DTableUpdateHander @@ -107,3 +108,4 @@ def serve_forever(self): self._virus_scanner.start() # default False # convert pdf manager conver_page_to_pdf_manager.start() # always True + browser_monitor.start() # always True diff --git a/dtable_events/convert_page/manager.py b/dtable_events/convert_page/manager.py index 8995b26f..5ce43991 100644 --- a/dtable_events/convert_page/manager.py +++ b/dtable_events/convert_page/manager.py @@ -1,182 +1,62 @@ -import io +import asyncio import logging -import os from queue import Queue, Full from threading import Thread -from dtable_events.app.config import INNER_DTABLE_DB_URL -from dtable_events.convert_page.utils import get_chrome_data_dir, get_driver, open_page_view, wait_page_view -from dtable_events.utils import get_inner_dtable_server_url, get_opt_from_conf_or_env -from dtable_events.utils.dtable_server_api import DTableServerAPI, NotFoundException +from playwright.async_api import async_playwright +from playwright._impl._errors import TimeoutError + +from dtable_events.app.config import INNER_DTABLE_DB_URL, DTABLE_WEB_SERVICE_URL +from dtable_events.convert_page.process_monitor import browser_monitor +from dtable_events.convert_page.utils import get_pdf_print_options +from dtable_events.utils import get_inner_dtable_server_url, get_opt_from_conf_or_env, uuid_str_to_36_chars from dtable_events.utils.dtable_db_api import DTableDBAPI +from dtable_events.utils.dtable_server_api import DTableServerAPI, NotFoundException logger = logging.getLogger(__name__) -dtable_server_url = get_inner_dtable_server_url() - - -class ConvertPageTOPDFManager: - - def __init__(self): - self.max_workers = 2 - self.max_queue = 1000 - self.drivers = {} - - def init(self, config): - section_name = 'CONERT-PAGE-TO-PDF' - key_max_workers = 'max_workers' - key_max_queue = 'max_queue' - - self.config = config - - if config.has_section('CONERT-PAGE-TO-PDF'): - try: - self.max_workers = int(get_opt_from_conf_or_env(config, section_name, key_max_workers, default=self.max_workers)) - except: - pass - try: - self.max_queue = int(get_opt_from_conf_or_env(config, section_name, key_max_queue, default=self.max_queue)) - except: - pass - self.queue = Queue(self.max_queue) # element in queue is a dict about task - try: # kill all existing chrome processes - os.system("ps aux | grep chrome | grep -v grep | awk ' { print $2 } ' | xargs kill -9 > /dev/null 2>&1") - except: - pass - - def get_driver(self, index): - driver = self.drivers.get(index) - if not driver: - driver = get_driver(get_chrome_data_dir(f'convert-manager-{index}')) - self.drivers[index] = driver - return driver - - def do_convert(self, index): - while True: - try: - task_info = self.queue.get() - logger.debug('do_convert task_info: %s', task_info) - ConvertPageToPDFWorker(task_info, index, self).work() - except Exception as e: - logger.exception('do task: %s error: %s', task_info, e) - - def start(self): - logger.debug('convert page to pdf max workers: %s max queue: %s', self.max_workers, self.max_queue) - for i in range(self.max_workers): - t_name = f'driver-{i}' - t = Thread(target=self.do_convert, args=(i,), name=t_name, daemon=True) - t.start() - def add_task(self, task_info): - try: - logger.debug('add task_info: %s', task_info) - self.queue.put(task_info, block=False) - except Full as e: - logger.warning('convert queue full task: %s will be ignored', task_info) - raise e +dtable_server_url = get_inner_dtable_server_url() - def clear_chrome(self, index): - driver = self.drivers.get(index) - if not driver: - logger.debug('no index %s chrome', index) - return - try: # delete all tab window except first blank - logger.debug('i: %s driver.window_handles[1:]: %s', index, driver.window_handles[1:]) - for window in driver.window_handles[1:]: - driver.switch_to.window(window) - driver.close() - # switch to the first tab window or error will occur when open new window - driver.switch_to.window(driver.window_handles[0]) - except Exception as e: - logger.exception('close driver: %s error: %s', index, e) - try: - driver.quit() - except Exception as e: - logger.exception('quit driver: %s error: %s', index, e) - self.drivers.pop(index, None) +class BrowserWorker(Thread): -class ConvertPageToPDFWorker: + def __init__(self, index, task_queue: Queue, pages=10): + super().__init__() + self.thread_id = index + self.task_queue = task_queue + self.playwright = None + self.browser = None + self.context = None + self.pages = pages - def __init__(self, task_info, index, manager: ConvertPageTOPDFManager): - self.task_info = task_info - self.index = index - self.manager = manager + self.is_browser_alive = False + self.browser_pid = None - def convert_with_rows(self, driver, resources): - dtable_uuid = self.task_info.get('dtable_uuid') - plugin_type = self.task_info.get('plugin_type') - page_id = self.task_info.get('page_id') - action_type = self.task_info.get('action_type') - per_converted_callbacks = self.task_info.get('per_converted_callbacks') or [] - all_converted_callbacks = self.task_info.get('all_converted_callbacks') or [] + self.loop = asyncio.new_event_loop() # each thread has own event loop - row_ids = resources.get('row_ids') - # resources in convert-page-to-pdf action - table = resources.get('table') - target_column = resources.get('target_column') + def disconnect_browser_cb(self): + self.is_browser_alive = False + self.browser = None + self.context = None + logger.error(f"Thread-{self.thread_id} browser disconnected... will use new browser") - dtable_server_api = DTableServerAPI('dtable-events', dtable_uuid, dtable_server_url) + async def get_context(self): + if not self.is_browser_alive: + logger.info(f"Thread-{self.thread_id} browser make a new browser...") - # convert - # open all tabs of rows step by step - # wait render and convert to pdf one by one - step = 10 - for i in range(0, len(row_ids), step): - try: - step_row_ids = row_ids[i: i+step] - row_session_dict = {} - # open rows - for row_id in step_row_ids: - session_id = open_page_view(driver, dtable_uuid, plugin_type, page_id, row_id, dtable_server_api.internal_access_token) - row_session_dict[row_id] = session_id - - # wait for chrome windows rendering - for row_id in step_row_ids: - output = io.BytesIO() # receive pdf content - session_id = row_session_dict[row_id] - wait_page_view(driver, session_id, plugin_type, row_id, output) - # per converted callbacks - pdf_content = output.getvalue() - if action_type == 'convert_page_to_pdf': - for callback in per_converted_callbacks: - try: - callback(row_id, pdf_content) - except Exception as e: - logging.exception(e) - except Exception as e: - logger.exception('convert task: %s error: %s', self.task_info, e) - continue - finally: - self.manager.clear_chrome(self.index) + if self.context: + return self.context - # callbacks - if action_type == 'convert_page_to_pdf': - for callback in all_converted_callbacks: - try: - callback(table, target_column) - except Exception as e: - logging.exception(e) - - def convert_without_rows(self, driver): - dtable_uuid = self.task_info.get('dtable_uuid') - plugin_type = self.task_info.get('plugin_type') - page_id = self.task_info.get('page_id') - action_type = self.task_info.get('action_type') - per_converted_callbacks = self.task_info.get('per_converted_callbacks') or [] - - dtable_server_api = DTableServerAPI('dtable-events', dtable_uuid, dtable_server_url) - - output = io.BytesIO() # receive pdf content - session_id = open_page_view(driver, dtable_uuid, plugin_type, page_id, None, dtable_server_api.internal_access_token) - wait_page_view(driver, session_id, plugin_type, None, output) - # per converted callback - pdf_content = output.getvalue() - if action_type == 'convert_document_to_pdf_and_send': - for callback in per_converted_callbacks: - try: - callback(pdf_content) - except Exception as e: - logging.exception(e) + if not self.playwright: + self.playwright = await async_playwright().start() + if not self.browser: + self.is_browser_alive = True + self.browser = await self.playwright.chromium.launch(headless=True) + self.browser.on('disconnected', self.disconnect_browser_cb) + self.context = await self.browser.new_context() + self.browser_pid = self.browser._impl_obj._connection._transport._proc.pid + browser_monitor.add_pid_info({self.browser_pid: {'name': f"BrowserWorker: Thread - {self.thread_id}"}}) + return self.context def check_resources(self, dtable_uuid, plugin_type, page_id, table_id, target_column_key, row_ids): """ @@ -239,41 +119,194 @@ def check_resources(self, dtable_uuid, plugin_type, page_id, table_id, target_co 'row_ids': row_ids }, None - def work(self): - dtable_uuid = self.task_info.get('dtable_uuid') - plugin_type = self.task_info.get('plugin_type') - page_id = self.task_info.get('page_id') - table_id = self.task_info.get('table_id') - target_column_key = self.task_info.get('target_column_key') - row_ids = self.task_info.get('row_ids') + async def row_page_to_pdf(self, url, context, row_id, action_type, per_converted_callbacks): + page = await context.new_page() + page.on("request", lambda request: logger.debug(f"Request: {request.method} {request.url}")) + page.on("response", lambda response: logger.debug(f"Response: {response.status} {response.url}")) + page.on("console", lambda msg: logger.debug(f"Console [{msg.type}]: {msg.text}")) + try: + await page.goto(url, wait_until="load") + await page.wait_for_load_state('networkidle', timeout=180*1000) + content = await page.pdf(**get_pdf_print_options()) + except TimeoutError: + content = await page.pdf(**get_pdf_print_options()) + await page.close() + if action_type == 'convert_page_to_pdf': + for callback in per_converted_callbacks: + try: + callback(row_id, content) + except Exception as e: + logger.exception(e) + + async def convert_with_rows(self, task_info, resources): + dtable_uuid = task_info.get('dtable_uuid') + plugin_type = task_info.get('plugin_type') + page_id = task_info.get('page_id') + action_type = task_info.get('action_type') + per_converted_callbacks = task_info.get('per_converted_callbacks') or [] + all_converted_callbacks = task_info.get('all_converted_callbacks') or [] + + row_ids = resources.get('row_ids') + # resources in convert-page-to-pdf action + table = resources.get('table') + target_column = resources.get('target_column') + + # convert + # open all tabs of rows pages by pages + # wait render and convert to pdf one by one + pages = self.pages + dtable_server_api = DTableServerAPI('dtable-events', dtable_uuid, dtable_server_url) + for i in range(0, len(row_ids), pages): + tasks = [] + context = await self.get_context() + # open rows + for row_id in row_ids[i: i+pages]: + url = '' + if plugin_type == 'page-design': + url = DTABLE_WEB_SERVICE_URL.strip('/') + '/dtable/%s/page-design/%s/row/%s/' % (uuid_str_to_36_chars(dtable_uuid), page_id, row_id) + if not url: + continue + url += '?access-token=%s&need_convert=%s' % (dtable_server_api.internal_access_token, 0) + + tasks.append(self.row_page_to_pdf(url, context, row_id, action_type, per_converted_callbacks)) + + results = await asyncio.gather(*tasks, return_exceptions=True) + for result in results: + if isinstance(result, Exception): + logger.exception(f'Thread-{self.thread_id} convert rows error: {e}') + + # callbacks + if action_type == 'convert_page_to_pdf': + for callback in all_converted_callbacks: + try: + callback(table, target_column) + except Exception as e: + logger.exception(e) + + async def convert_without_rows(self, task_info): + dtable_uuid = task_info.get('dtable_uuid') + plugin_type = task_info.get('plugin_type') + page_id = task_info.get('page_id') + action_type = task_info.get('action_type') + per_converted_callbacks = task_info.get('per_converted_callbacks') or [] + + url = '' + if plugin_type == 'document': + url = DTABLE_WEB_SERVICE_URL.strip('/') + '/dtable/%s/document/%s/row/%s/' % (uuid_str_to_36_chars(dtable_uuid), page_id, None) + if not url: + return + + dtable_server_api = DTableServerAPI('dtable-events', dtable_uuid, dtable_server_url) + url += '?access-token=%s&need_convert=%s' % (dtable_server_api.access_token, 0) + + context = await self.get_context() + page = await context.new_page() + page.on("request", lambda request: logger.debug(f"Request: {request.method} {request.url}")) + page.on("response", lambda response: logger.debug(f"Response: {response.status} {response.url}")) + page.on("console", lambda msg: logger.debug(f"Console [{msg.type}]: {msg.text}")) + try: + await page.goto(url, wait_until="load") + await page.wait_for_load_state('networkidle', timeout=180*1000) + pdf_content = await page.pdf(**get_pdf_print_options()) + except TimeoutError: + pdf_content = await page.pdf(**get_pdf_print_options()) + + if action_type == 'convert_document_to_pdf_and_send': + for callback in per_converted_callbacks: + try: + callback(pdf_content) + except Exception as e: + logger.exception(e) + await page.close() + + async def _do_convert(self, task_info): + dtable_uuid = task_info.get('dtable_uuid') + plugin_type = task_info.get('plugin_type') + page_id = task_info.get('page_id') + table_id = task_info.get('table_id') + target_column_key = task_info.get('target_column_key') + row_ids = task_info.get('row_ids') # resource check # Rather than wait one minute to render a wrong page, a resources check is more effective try: resources, error_msg = self.check_resources(dtable_uuid, plugin_type, page_id, table_id, target_column_key, row_ids) if not resources: - logger.warning('plugin: %s dtable: %s page: %s task_info: %s error: %s', plugin_type, dtable_uuid, page_id, self.task_info, error_msg) + logger.warning('plugin: %s dtable: %s page: %s task_info: %s error: %s', plugin_type, dtable_uuid, page_id, task_info, error_msg) return row_ids = resources.get('row_ids') except Exception as e: - logger.exception('plugin: %s dtable: %s page: %s task_info: %s resource check error: %s', plugin_type, dtable_uuid, page_id, self.task_info, e) + logger.exception('plugin: %s dtable: %s page: %s task_info: %s resource check error: %s', plugin_type, dtable_uuid, page_id, task_info, e) return - try: - driver = self.manager.get_driver(self.index) - except Exception as e: - logger.exception('get driver: %s error: %s', self.index, e) - return + # browser context access url + if row_ids: + await self.convert_with_rows(task_info, resources) + else: + await self.convert_without_rows(task_info) + async def do_convert(self, task_info): try: - if row_ids is not None: # rows - self.convert_with_rows(driver, resources) - else: # no rows - self.convert_without_rows(driver) + await self._do_convert(task_info) except Exception as e: - logger.exception(e) + logger.exception(f'do convert Thread-{self.thread_id} Exception in loop.run_until_complete - {e}') + try: + await self.browser.close() + except Exception as e: + logger.exception(f'do convert Thread-{self.thread_id} close browser error: {e}') + finally: + self.context = None + self.browser = None finally: - self.manager.clear_chrome(self.index) + if self.browser_pid: + browser_monitor.remove_pid(self.browser_pid) + + def run(self): + asyncio.set_event_loop(self.loop) + while True: + task_info = self.task_queue.get() + + try: + self.loop.run_until_complete(self.do_convert(task_info)) + except Exception as e: + logger.exception(f'Thread-{self.thread_id} Exception in loop.run_until_complete - {e}') + + +class ConvertPageToPDFManager: + + def __init__(self): + self.max_workers = 2 + self.max_queue = 1000 + self.pages = 10 + + def init(self, config): + section_name = 'CONERT-PAGE-TO-PDF' + key_max_workers = 'max_workers' + key_max_queue = 'max_queue' + key_pages = 'pages' + + self.config = config + + if config.has_section('CONERT-PAGE-TO-PDF'): + self.max_workers = int(get_opt_from_conf_or_env(config, section_name, key_max_workers, default=self.max_workers)) + self.max_queue = int(get_opt_from_conf_or_env(config, section_name, key_max_queue, default=self.max_queue)) + self.pages = int(get_opt_from_conf_or_env(config, section_name, key_pages, default=self.pages)) + + self.queue = Queue(self.max_queue) # element in queue is a dict about task + + def start(self): + logger.debug('convert page to pdf max workers: %s max queue: %s pages: %s', self.max_workers, self.max_queue, self.pages) + for i in range(self.max_workers): + t = BrowserWorker(i, self.queue, self.pages) + t.start() + + def add_task(self, task_info): + try: + logger.debug('add task_info: %s', task_info) + self.queue.put(task_info, block=False) + except Full as e: + logger.warning('convert queue full task: %s will be ignored', task_info) + raise e -conver_page_to_pdf_manager = ConvertPageTOPDFManager() +conver_page_to_pdf_manager = ConvertPageToPDFManager() diff --git a/dtable_events/convert_page/process_monitor.py b/dtable_events/convert_page/process_monitor.py new file mode 100644 index 00000000..83101503 --- /dev/null +++ b/dtable_events/convert_page/process_monitor.py @@ -0,0 +1,72 @@ +import logging +import time +from threading import Thread + +try: + import psutil +except: + psutil = None + +from dtable_events.app.log import setup_logger + +monitor_logger = setup_logger('browser-monitor.log') + + +class ProcessMonitor(Thread): + def __init__(self, interval=1): + super(ProcessMonitor, self).__init__() + self.pid_infos = dict() + self.interval = interval # refresh interval + self.running = False + self.daemon = True + # self.monitor_logger = setup_logger('browser-monitor.log') + + def can_monitor(self): + return monitor_logger.root.level == logging.DEBUG + # return self.monitor_logger.root.level == logging.DEBUG + + def add_pid_info(self, pid_info): + if not self.can_monitor(): + return + self.pid_infos.update(pid_info) + + def remove_pid(self, pid): + if not self.can_monitor(): + return + self.pid_infos.pop(pid, None) + + def run(self): + if not self.can_monitor(): + monitor_logger.info("No monitoring!") + return + monitor_logger.info("Starting monitoring...") + self.running = True + while self.running: + for pid in list(self.pid_infos.keys()): + total_cpu = 0.0 + total_memory = 0.0 + try: + process = psutil.Process(pid) + total_cpu += process.cpu_percent(interval=0) + total_memory += process.memory_info().rss + + # stats sub-processes + for child in process.children(recursive=True): + try: + total_cpu += child.cpu_percent(interval=0) + total_memory += child.memory_info().rss + except psutil.NoSuchProcess: + continue + + except psutil.NoSuchProcess: + monitor_logger.info(f"pid: {self.pid_infos[pid]} not exists, removed!") + self.pid_infos.pop(pid, None) # remove when pid not exists longer + continue + + total_memory_mb = total_memory / (1024 * 1024) # convert to MB + monitor_logger.info(f"pid: {self.pid_infos.get(pid)} Total CPU: {total_cpu:.2f}%, Total Memory: {total_memory_mb:.2f} MB") + + time.sleep(self.interval) + + +browser_monitor = ProcessMonitor() diff --git a/dtable_events/convert_page/utils.py b/dtable_events/convert_page/utils.py index 37aec35f..2931b91d 100644 --- a/dtable_events/convert_page/utils.py +++ b/dtable_events/convert_page/utils.py @@ -1,155 +1,7 @@ -import base64 -import io -import json -import logging -import time -import os - -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.support.ui import WebDriverWait - -from dtable_events.app.config import DTABLE_WEB_SERVICE_URL -from dtable_events.utils import uuid_str_to_36_chars - -logger = logging.getLogger(__name__) - -CHROME_DATA_DIR = '/tmp/chrome-user-datas' - - -def get_chrome_data_dir(dir_name='tmp'): - if not os.path.isdir(CHROME_DATA_DIR): - os.makedirs(CHROME_DATA_DIR) - return os.path.join(CHROME_DATA_DIR, dir_name) - - -def get_driver(user_data_path): - webdriver_options = Options() - - webdriver_options.add_argument('--no-sandbox') - webdriver_options.add_argument('--headless') - webdriver_options.add_argument('--disable-gpu') - webdriver_options.add_argument('--disable-dev-shm-usage') - webdriver_options.add_argument(f'--user-data-dir={user_data_path}') - - driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=webdriver_options) - return driver - - -def open_page_view(driver: webdriver.Chrome, dtable_uuid, plugin_type, page_id, row_id, access_token): - if plugin_type == 'page-design': - url = DTABLE_WEB_SERVICE_URL.strip('/') + '/dtable/%s/page-design/%s/' % (uuid_str_to_36_chars(dtable_uuid), page_id) - if row_id: - url = DTABLE_WEB_SERVICE_URL.strip('/') + '/dtable/%s/page-design/%s/row/%s/' % (uuid_str_to_36_chars(dtable_uuid), page_id, row_id) - elif plugin_type == 'document': - url = DTABLE_WEB_SERVICE_URL.strip('/') + '/dtable/%s/document/%s/row/%s/' % (uuid_str_to_36_chars(dtable_uuid), page_id, row_id) - - url += '?access-token=%s&need_convert=%s' % (access_token, 0) - logger.debug('check url: %s', url) - driver.execute_script(f"window.open('{url}')") - return driver.window_handles[-1] - - -def wait_page_view(driver: webdriver.Chrome, session_id, plugin_type, row_id, output): - def check_images_and_networks(driver, frequency=0.5): - """ - make sure all images complete - make sure no new connections in 0.5s. - TODO: Unreliable and need to be continuously updated. - """ - images_done = driver.execute_script(''' - let p = window.performance || window.mozPerformance || window.msPerformance || window.webkitPerformance || {}; - let entries = p.getEntries(); - let images = Array.from(document.images).filter(image => image.src.indexOf('/asset/') !== -1); - if (images.length === 0) return true; - return images.filter(image => image.complete).length == images.length; - ''') - if not images_done: - return False - - entries_count = None - while True: - now_entries_count = driver.execute_script(''' - let p = window.performance || window.mozPerformance || window.msPerformance || window.webkitPerformance || {}; - return p.getEntries().length; - ''') - if entries_count is None: - entries_count = now_entries_count - time.sleep(frequency) - continue - else: - if now_entries_count == entries_count and \ - driver.execute_script("return document.readyState === 'complete'"): - return True - break - return False - - await_react_render = 60 - # sleep_time = 2 - # if not row_id: - # await_react_render = 180 - # sleep_time = 6 - - driver.switch_to.window(session_id) - - monitor_dom_id = '' - if plugin_type == 'page-design': - monitor_dom_id = 'page-design-render-complete' - elif plugin_type == 'document': - monitor_dom_id = 'document-render-complete' - - try: - logger.debug('check to wait render') - # make sure react is rendered, timeout await_react_render, rendering is not completed within 3 minutes, and rendering performance needs to be improved - WebDriverWait(driver, await_react_render).until(lambda driver: driver.find_element_by_id(monitor_dom_id) is not None, message='wait react timeout') - logger.debug('check to wait images') - # make sure images from asset are rendered, timeout 120s - WebDriverWait(driver, 120, poll_frequency=1).until(lambda driver: check_images_and_networks(driver), message='wait images and networks timeout') - logger.debug('check to sleep') - # time.sleep(sleep_time) # test how non-sleep effects renderring - except Exception as e: - logger.warning('wait for page error: %s', e) - finally: - calculated_print_options = { - 'landscape': False, - 'displayHeaderFooter': False, - 'printBackground': True, - 'preferCSSPageSize': True, - } - - resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id - url = driver.command_executor._url + resource - body = json.dumps({'cmd': 'Page.printToPDF', 'params': calculated_print_options}) - - try: - logger.debug('check to export pdf') - response = driver.command_executor._request('POST', url, body) - logger.debug('check to output') - if not response: - logger.error('execute printToPDF error no response') - v = response.get('value')['data'] - if isinstance(output, str): - with open(output, 'wb') as f: - f.write(base64.b64decode(v)) - elif isinstance(output, io.BytesIO): - output.write(base64.b64decode(v)) - logger.info('check to convert page to pdf success!') - except Exception as e: - logger.exception('execute printToPDF error: {}'.format(e)) - - # debug page-design view in chrome, console log and network log, don't delete - logger.debug('browser console start') - for log in list(driver.get_log('browser')): - logger.debug(log) - logger.debug('browser console end') - network_logs = driver.execute_script("var performance = window.performance || window.mozPerformance || window.msPerformance || window.webkitPerformance || {}; var network = performance.getEntriesByType('resource') || {}; return network;") - logger.debug('network logs start') - for item in network_logs: - logger.debug('check name: %s start: %s duration: %s end: %s', item.get('name'), item.get('startTime'), item.get('duration'), item.get('responseEnd')) - logger.debug(item) - logger.debug('network logs end') - - -def convert_page_to_pdf(driver: webdriver.Chrome, dtable_uuid, plugin_type, page_id, row_id, access_token, output): - session_id = open_page_view(driver, dtable_uuid, plugin_type, page_id, row_id, access_token) - wait_page_view(driver, session_id, plugin_type, row_id, output) +def get_pdf_print_options(): + return { + 'landscape': False, + 'display_header_footer': False, + 'print_background': True, + 'prefer_css_page_size': True + } diff --git a/dtable_events/dtable_io/__init__.py b/dtable_events/dtable_io/__init__.py index c3fff1cc..12436475 100644 --- a/dtable_events/dtable_io/__init__.py +++ b/dtable_events/dtable_io/__init__.py @@ -1,9 +1,12 @@ +import asyncio import os import shutil import uuid import requests from datetime import datetime +from playwright.async_api import async_playwright +from playwright._impl._errors import TimeoutError from seaserv import seafile_api @@ -25,7 +28,6 @@ import_excel_csv_add_table_by_dtable_server, update_parsed_file_by_dtable_server, \ parse_update_excel_upload_excel_to_json, parse_update_csv_upload_csv_to_json, parse_and_import_excel_csv_to_dtable, \ parse_and_import_excel_csv_to_table, parse_and_update_file_to_table, parse_and_append_excel_csv_to_table -from dtable_events.convert_page.utils import get_chrome_data_dir, convert_page_to_pdf as _convert_page_to_pdf, get_driver from dtable_events.statistics.db import save_email_sending_records, batch_save_email_sending_records from dtable_events.data_sync.data_sync_utils import run_sync_emails from dtable_events.utils import get_inner_dtable_server_url, is_valid_email, uuid_str_to_36_chars @@ -34,12 +36,17 @@ from dtable_events.utils.email_sender import EmailSender from dtable_events.dtable_io.utils import clear_tmp_dir, clear_tmp_file, clear_tmp_files_and_dirs from dtable_events.app.log import setup_logger +from dtable_events.convert_page.process_monitor import browser_monitor +from dtable_events.convert_page.utils import get_pdf_print_options dtable_io_logger = setup_logger('dtable_events_io.log') dtable_message_logger = setup_logger('dtable_events_message.log') dtable_data_sync_logger = setup_logger('dtable_events_data_sync.log') dtable_plugin_email_logger = setup_logger('dtable_events_plugin_email.log') +convert_pdf_loop = asyncio.new_event_loop() +asyncio.set_event_loop(convert_pdf_loop) + def get_dtable_export_content(username, repo_id, workspace_id, dtable_uuid, asset_dir_id, config): """ @@ -662,6 +669,7 @@ def send_notification_msg(emails, user_col_key, msg, dtable_uuid, username, tabl dtable_message_logger.info('Notification sending success!') return result + def convert_page_to_pdf(dtable_uuid, plugin_type, page_id, row_id, username=None): dtable_server_url = get_inner_dtable_server_url() if not username: @@ -672,16 +680,36 @@ def convert_page_to_pdf(dtable_uuid, plugin_type, page_id, row_id, username=None os.makedirs(target_dir) target_path = os.path.join(target_dir, '%s_%s_%s.pdf' % (dtable_uuid, page_id, row_id)) - chrome_data_dir_name = f'{dtable_uuid}-{page_id}-{row_id}' - driver = get_driver(get_chrome_data_dir(chrome_data_dir_name)) - try: - _convert_page_to_pdf(driver, dtable_uuid, plugin_type, page_id, row_id, access_token, target_path) - except Exception as e: - dtable_io_logger.exception('convert dtable: %s page: %s row: %s error: %s', dtable_uuid, page_id, row_id, e) - finally: - if os.path.exists(chrome_data_dir_name): - shutil.rmtree(chrome_data_dir_name) - driver.quit() + async def access_and_save(): + url = '' + if plugin_type == 'page-design': + url = DTABLE_WEB_SERVICE_URL.strip('/') + '/dtable/%s/page-design/%s/row/%s/' % (uuid_str_to_36_chars(dtable_uuid), page_id, row_id) + elif plugin_type == 'document': + url = DTABLE_WEB_SERVICE_URL.strip('/') + '/dtable/%s/document/%s/row/%s/' % (uuid_str_to_36_chars(dtable_uuid), page_id, row_id) + if not url: + return + url += '?access-token=%s&need_convert=%s' % (access_token, 0) + dtable_io_logger.debug('convert_page_to_pdf url: %s', url) + async with async_playwright() as playwright: + browser = await playwright.chromium.launch(headless=True) + context = await browser.new_context() + page = await context.new_page() + pid = browser._impl_obj._connection._transport._proc.pid + browser_monitor.add_pid_info({pid: {'name': f"dtable-io dtable_uuid: {dtable_uuid} plugin: {plugin_type} page_id: {page_id} row_id: {row_id}"}}) + try: + page.on("request", lambda request: dtable_io_logger.debug(f"Request: {request.method} {request.url}")) + page.on("response", lambda response: dtable_io_logger.debug(f"Response: {response.status} {response.url}")) + page.on("console", lambda msg: dtable_io_logger.debug(f"Console [{msg.type}]: {msg.text}")) + await page.goto(url, wait_until="load") + await page.wait_for_load_state('networkidle', timeout=180*1000) + await page.pdf(path=target_path, **get_pdf_print_options()) + except TimeoutError: + dtable_io_logger.exception('dtable: %s plugin: %s page: %s row: %s timeout', dtable_uuid, plugin_type, page_id, row_id) + await page.pdf(path=target_path, **get_pdf_print_options()) + except Exception as e: + dtable_io_logger.exception('dtable: %s plugin: %s page: %s row: %s error: %s', dtable_uuid, plugin_type, page_id, row_id, e) + + convert_pdf_loop.run_until_complete(access_and_save()) def convert_view_to_excel(dtable_uuid, table_id, view_id, username, id_in_org, user_department_ids_map, permission, name, repo_id, is_support_image=False): diff --git a/requirements.txt b/requirements.txt index 80ac90b0..0a525b18 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,5 @@ Flask==2.2.* python-dateutil==2.8.* imapclient==2.3.* msal==1.31.0 +playwright==1.48.0 +psutil==6.1.0