diff --git a/apps/authentication/webapi/providers/database.py b/apps/authentication/webapi/providers/database.py index 3e2d2b1..c54f393 100644 --- a/apps/authentication/webapi/providers/database.py +++ b/apps/authentication/webapi/providers/database.py @@ -1,3 +1,4 @@ +import logging from common.config.app_settings import app_settings from beanie import init_beanie from motor.motor_asyncio import AsyncIOMotorClient @@ -8,6 +9,9 @@ def register(app): app.debug = "auth_mongo_debug" app.title = "auth_mongo_name" + # Configure logging for pymongo + logging.getLogger("pymongo").setLevel(logging.WARNING) # Suppress DEBUG logs + @app.on_event("startup") async def start_database(): await initiate_database() diff --git a/apps/content/backend/content/content_service.py b/apps/content/backend/content/content_service.py index 50d905c..832b7c8 100644 --- a/apps/content/backend/content/content_service.py +++ b/apps/content/backend/content/content_service.py @@ -1,35 +1,75 @@ -from typing import Dict, List, Optional +import asyncio +from typing import List, Optional from datetime import datetime, timedelta, timezone -from .constants import ContentSource, ContentMediaType, ContentDataFormat from .models import ContentDirectory, ContentFolderDoc -from common.config.app_settings import app_settings -from backend.document.document_manager import DocumentManager from common.constants.region import UserRegion +from backend.document.document_manager import DocumentManager from .content_sharepoint_manager import ContentSharePointManager -import pytz +from backend.content.constants import ContentSource class ContentService: def __init__(self) -> None: - pass + self.sharepoint_manager = ContentSharePointManager() + self.expiry_time = timedelta(hours=24) async def retrieve_directories_for_folder( self, folder_name: str, region: UserRegion ) -> List[ContentDirectory]: + # Check cache + folder_key = f"{folder_name}/{region.name}" folder = await ContentFolderDoc.find_one( - ContentFolderDoc.folder_name == folder_name, - ContentFolderDoc.region == region + ContentFolderDoc.folder_name == folder_key, + ContentFolderDoc.region == region, ) - - if folder is None or folder.valid_thru.replace(tzinfo=timezone.utc) < datetime.now(timezone.utc): - await ContentSharePointManager().retrieve_directories_for_folder(folder_name=folder_name, region=region) - folder = await ContentFolderDoc.find_one( - ContentFolderDoc.folder_name == folder_name, - ContentFolderDoc.region == region + # Refresh cache if expired or not present + if folder is None or folder.valid_thru.replace( + tzinfo=timezone.utc + ) < datetime.now(timezone.utc): + folder = await self.__refresh_folder_from_sharepoint(folder_name, region) + + return folder.content_directories if folder else [] + + async def __refresh_folder_from_sharepoint( + self, folder_name: str, region: UserRegion + ) -> ContentFolderDoc: + content_folder_name = f"{folder_name}/{region.name}" + sp_folders = self.sharepoint_manager.list_sub_folders(content_folder_name) + current_time = datetime.now(timezone.utc) + + # Create or update folder metadata + folder = await ContentFolderDoc.find_one( + ContentFolderDoc.folder_name == content_folder_name + ) + if folder is None: + folder = ContentFolderDoc( + folder_name=content_folder_name, + content_directories=[], + udpate_time=current_time, + update_source=ContentSource.SHAREPOINT, + valid_thru=current_time + self.expiry_time, + region=region, ) - - return folder.content_directories if folder else None + else: + folder.content_directories.clear() + + # Process subfolders in parallel + tasks = [ + self.sharepoint_manager.process_subfolder(content_folder_name, sp_folder) + for sp_folder in sp_folders + ] + try: + folder.content_directories = await asyncio.gather(*tasks) + except Exception as e: + raise RuntimeError(f"Failed to process subfolders: {e}") + + # Save folder metadata + folder.udpate_time = current_time + folder.valid_thru = current_time + self.expiry_time + await folder.save() + + return folder async def retrieve_content_as_media_data(self, document_id: str) -> Optional[str]: document_manager = DocumentManager() diff --git a/apps/content/backend/content/content_sharepoint_manager.py b/apps/content/backend/content/content_sharepoint_manager.py index abde989..77081e6 100644 --- a/apps/content/backend/content/content_sharepoint_manager.py +++ b/apps/content/backend/content/content_sharepoint_manager.py @@ -1,14 +1,12 @@ -from typing import Dict, List, Optional -from common.config.app_settings import app_settings -from datetime import datetime, timedelta, timezone +import asyncio +from datetime import datetime, timezone, timedelta +from typing import List, Dict from backend.sharepoint.sharepoint_graph_client import SharePointGraphClient -from common.constants.region import UserRegion from backend.document.document_manager import DocumentManager -from backend.content.constants import ( - ContentSource, - ContentFileConstants, -) from backend.content.models import ContentDirectory, ContentFolderDoc +from backend.content.constants import ContentFileConstants, ContentSource + +from common.config.app_settings import app_settings class ContentSharePointManager: @@ -25,127 +23,62 @@ class ContentSharePointManager: ) self.share_point_file_expiry = timedelta(hours=24) - def __generate_created__by__(self, folder_name): - return "content-service-" + folder_name.replace("/", "-").lower() + def list_sub_folders(self, folder_name: str) -> List[dict]: + """ + Fetches the subfolders under the specified folder from SharePoint. + """ + try: + # Use SharePointGraphClient to list subfolders + return self.sharepoint_client.list_sub_folders(folder_name) + except Exception as e: + raise ValueError(f"Failed to list subfolders for {folder_name}: {e}") - async def retrieve_directories_for_folder( - self, folder_name: str, region: UserRegion - ): - content_folder_name = folder_name + "/" + region.name - sp_folders = self.sharepoint_client.list_sub_folders(content_folder_name) - current_time = datetime.now(timezone.utc) - - folder = await ContentFolderDoc.find_one( - ContentFolderDoc.folder_name == content_folder_name + async def process_subfolder( + self, content_folder_name: str, sp_folder: dict + ) -> ContentDirectory: + content_directory = ContentDirectory(content_name=sp_folder["name"]) + sp_files = self.sharepoint_client.list_files( + f"{content_folder_name}/{sp_folder['name']}" ) - if folder is None: - folder = ContentFolderDoc( - folder_name=folder_name, - content_directories=[], - udpate_time=current_time, - update_source=ContentSource.SHAREPOINT, - valid_thru=current_time + self.share_point_file_expiry, - region=region, + # Process files in parallel + tasks = [ + self.__process_file(file, content_directory, content_folder_name) + for file in sp_files + ] + await asyncio.gather(*tasks) + + return content_directory + + async def __process_file( + self, sp_file: dict, content_directory: ContentDirectory, folder_name: str + ): + document_manager = DocumentManager() + file_content = self.sharepoint_client.get_file_content(sp_file["id"]) + + if sp_file["name"].lower() == ContentFileConstants.COVER_FILE_NAME.lower(): + content_directory.cover_document_id = ( + await document_manager.save_document_file( + self.__generate_created_by(folder_name), + sp_file["name"], + file_content, + ) ) - else: - folder.content_directories.clear() - - for sp_folder in sp_folders: - content_directory = ContentDirectory( - content_name=sp_folder["name"], - cover_document_id=None, - summary_text=None, - title_text=None, - content_link=None, - content_document_id=None, + elif sp_file["name"].lower() == ContentFileConstants.SUMMARY_FILE_NAME.lower(): + content_directory.summary_text = file_content + elif sp_file["name"].lower() == ContentFileConstants.TITLE_FILE_NAME.lower(): + content_directory.title_text = file_content + elif ( + sp_file["name"].lower() + == ContentFileConstants.CONTENT_PDF_FILE_NAME.lower() + ): + content_directory.content_document_id = ( + await document_manager.save_document_file( + self.__generate_created_by(folder_name), + sp_file["name"], + file_content, + ) ) - sp_files = self.sharepoint_client.list_files( - content_folder_name + "/" + sp_folder["name"] - ) - for sp_file in sp_files: - if ( - sp_file["name"].lower() - == ContentFileConstants.COVER_FILE_NAME.lower() - ): - cover_file_content = self.sharepoint_client.get_file_content( - sp_file["id"] - ) - cover_document_manager = DocumentManager() - file_name = sp_file["name"].lower() - created_by = self.__generate_created__by__(folder_name=folder_name) - content_directory.cover_document_id = ( - await cover_document_manager.save_document_file( - created_by, file_name, cover_file_content - ) - ) - elif ( - sp_file["name"].lower() - == ContentFileConstants.SUMMARY_FILE_NAME.lower() - ): - content_directory.summary_text = ( - self.sharepoint_client.get_file_content(sp_file["id"]) - ) - elif ( - sp_file["name"].lower() - == ContentFileConstants.TITLE_FILE_NAME.lower() - ): - content_directory.title_text = ( - self.sharepoint_client.get_file_content(sp_file["id"]) - ) - elif ( - sp_file["name"].lower() - == ContentFileConstants.CONTENT_LINK_FILE_NAME.lower() - ): - content_directory.content_link = ( - self.sharepoint_client.get_file_content(sp_file["id"]) - ) - elif ( - sp_file["name"].lower() - == ContentFileConstants.CONTENT_HTML_FILE_NAME.lower() - ): - content_directory.content_html = ( - self.sharepoint_client.get_file_content(sp_file["id"]) - ) - elif ( - sp_file["name"].lower() - == ContentFileConstants.CONTENT_TEXT_FILE_NAME.lower() - ): - content_directory.content_text = ( - self.sharepoint_client.get_file_content(sp_file["id"]) - ) - elif ( - sp_file["name"].lower() - == ContentFileConstants.CONTENT_PDF_FILE_NAME.lower() - ): - content_file_content = self.sharepoint_client.get_file_content( - sp_file["id"] - ) - content_document_manager = DocumentManager() - file_name = sp_file["name"] - created_by = self.__generate_created__by__(folder_name=folder_name) - content_directory.content_document_id = ( - await content_document_manager.save_document_file( - created_by, file_name, content_file_content - ) - ) - - folder.content_directories.append(content_directory) - - folder.udpate_time = current_time - folder.update_source = ContentSource.SHAREPOINT - folder.valid_thru = current_time + self.share_point_file_expiry - await folder.save() - - async def retrieve_directorys_for_all_folders(self): - current_time = datetime.now(timezone.utc) - folders = await ContentFolderDoc.find( - ContentFolderDoc.update_source == ContentSource.SHAREPOINT, - ContentFolderDoc.valid_thru < current_time, - ).to_list() - - for folder in folders: - await self.retrieve_directories_for_folder( - folder.folder_name, folder.region - ) + def __generate_created_by(self, folder_name: str) -> str: + return f"content-service-{folder_name.replace('/', '-').lower()}" diff --git a/apps/content/backend/document/document_manager.py b/apps/content/backend/document/document_manager.py index 7e4c3f7..be8141b 100644 --- a/apps/content/backend/document/document_manager.py +++ b/apps/content/backend/document/document_manager.py @@ -1,30 +1,37 @@ from common.config.app_settings import app_settings import httpx + class DocumentManager: def __init__(self): - self.storage_service_api_base = app_settings.CENTRAL_STORAGE_WEBAPI_URL_BASE.rstrip('/') + '/' - - async def retrieve_document_info(self, document_id:str): - api_url = self.storage_service_api_base + "retrieve_document_info/" + document_id + self.storage_service_api_base = ( + app_settings.CENTRAL_STORAGE_WEBAPI_URL_BASE.rstrip("/") + "/" + ) + + async def retrieve_document_info(self, document_id: str): + api_url = ( + self.storage_service_api_base + "retrieve_document_info/" + document_id + ) async with httpx.AsyncClient() as client: response = await client.get(api_url) return response.json() - async def retrieve_document_as_http_media(self, document_id:str): - api_url = self.storage_service_api_base + "read-document-as-http-media/" + document_id + async def retrieve_document_as_http_media(self, document_id: str): + api_url = ( + self.storage_service_api_base + "read-document-as-http-media/" + document_id + ) async with httpx.AsyncClient() as client: response = await client.get(api_url) return response.json() - async def save_document_file(self,associated_with:str, name:str,blob:bytes)->str: + async def save_document_file( + self, associated_with: str, name: str, blob: bytes + ) -> str: api_url = self.storage_service_api_base + "upload-file" - files = {'file':(name,blob)} + files = {"file": (name, blob)} + print("this is files", files) async with httpx.AsyncClient() as client: - response = await client.post(api_url, - data={ - 'associated_with':associated_with - }, - files=files - ) - return response.json()['document_id'] + response = await client.post( + api_url, data={"associated_with": associated_with}, files=files + ) + return response.json()["document_id"] diff --git a/apps/content/webapi/routes/content/retrieve_content_as_media_data.py b/apps/content/webapi/routes/content/retrieve_content_as_media_data.py index b455e09..ae6c73d 100644 --- a/apps/content/webapi/routes/content/retrieve_content_as_media_data.py +++ b/apps/content/webapi/routes/content/retrieve_content_as_media_data.py @@ -3,7 +3,6 @@ from fastapi.encoders import jsonable_encoder from fastapi.responses import JSONResponse from pydantic import BaseModel from backend.content.content_service import ContentService -from fastapi_cache.decorator import cache # Import the cache decorator router = APIRouter() @@ -15,10 +14,6 @@ router = APIRouter() description="retrieve content as media data which can be posted to web page.", response_description="Media data", ) -@cache( - expire=300, # Cache the result for 5 minutes - key_builder=lambda func, *args, **kwargs: f"content-media:{kwargs.get('document_id', args[0] if len(args) > 0 else '')}", -) async def retrieve_content_as_media_data(document_id: str): result = await ContentService().retrieve_content_as_media_data(document_id) return JSONResponse(content=jsonable_encoder(result)) diff --git a/apps/content/webapi/routes/content/retrieve_directories_for_folder.py b/apps/content/webapi/routes/content/retrieve_directories_for_folder.py index 80c7d20..eef4667 100644 --- a/apps/content/webapi/routes/content/retrieve_directories_for_folder.py +++ b/apps/content/webapi/routes/content/retrieve_directories_for_folder.py @@ -3,7 +3,6 @@ from fastapi.encoders import jsonable_encoder from fastapi.responses import JSONResponse from backend.content.content_service import ContentService from common.constants.region import UserRegion -from fastapi_cache.decorator import cache # Import the cache decorator router = APIRouter() @@ -15,10 +14,7 @@ router = APIRouter() description="retrieve directories for a folder, such as testimony, legal, etc", response_description="The list of directories under the folder", ) -@cache( - expire=300, # Cache for 300 seconds - key_builder=lambda func, *args, **kwargs: f"folder:{kwargs.get('folder_name', args[0] if len(args) > 0 else '')}:region:{kwargs.get('region', args[1] if len(args) > 1 else '')}", -) +# @cache(expire=300) # Cache results for 5 minutes async def retrieve_directories_for_folder(folder_name: str, region: UserRegion): result = await ContentService().retrieve_directories_for_folder(folder_name, region) return JSONResponse(content=jsonable_encoder(result))