Removing cache, need further tuning

This commit is contained in:
jetli 2024-12-26 08:40:12 +00:00
parent 1972a5758b
commit 550c49f3ec
6 changed files with 142 additions and 167 deletions

View File

@ -1,3 +1,4 @@
import logging
from common.config.app_settings import app_settings from common.config.app_settings import app_settings
from beanie import init_beanie from beanie import init_beanie
from motor.motor_asyncio import AsyncIOMotorClient from motor.motor_asyncio import AsyncIOMotorClient
@ -8,6 +9,9 @@ def register(app):
app.debug = "auth_mongo_debug" app.debug = "auth_mongo_debug"
app.title = "auth_mongo_name" app.title = "auth_mongo_name"
# Configure logging for pymongo
logging.getLogger("pymongo").setLevel(logging.WARNING) # Suppress DEBUG logs
@app.on_event("startup") @app.on_event("startup")
async def start_database(): async def start_database():
await initiate_database() await initiate_database()

View File

@ -1,35 +1,75 @@
from typing import Dict, List, Optional import asyncio
from typing import List, Optional
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
from .constants import ContentSource, ContentMediaType, ContentDataFormat
from .models import ContentDirectory, ContentFolderDoc from .models import ContentDirectory, ContentFolderDoc
from common.config.app_settings import app_settings
from backend.document.document_manager import DocumentManager
from common.constants.region import UserRegion from common.constants.region import UserRegion
from backend.document.document_manager import DocumentManager
from .content_sharepoint_manager import ContentSharePointManager from .content_sharepoint_manager import ContentSharePointManager
import pytz from backend.content.constants import ContentSource
class ContentService: class ContentService:
def __init__(self) -> None: def __init__(self) -> None:
pass self.sharepoint_manager = ContentSharePointManager()
self.expiry_time = timedelta(hours=24)
async def retrieve_directories_for_folder( async def retrieve_directories_for_folder(
self, folder_name: str, region: UserRegion self, folder_name: str, region: UserRegion
) -> List[ContentDirectory]: ) -> List[ContentDirectory]:
# Check cache
folder_key = f"{folder_name}/{region.name}"
folder = await ContentFolderDoc.find_one( folder = await ContentFolderDoc.find_one(
ContentFolderDoc.folder_name == folder_name, ContentFolderDoc.folder_name == folder_key,
ContentFolderDoc.region == region ContentFolderDoc.region == region,
) )
if folder is None or folder.valid_thru.replace(tzinfo=timezone.utc) < datetime.now(timezone.utc): # Refresh cache if expired or not present
await ContentSharePointManager().retrieve_directories_for_folder(folder_name=folder_name, region=region) if folder is None or folder.valid_thru.replace(
tzinfo=timezone.utc
) < datetime.now(timezone.utc):
folder = await self.__refresh_folder_from_sharepoint(folder_name, region)
return folder.content_directories if folder else []
async def __refresh_folder_from_sharepoint(
self, folder_name: str, region: UserRegion
) -> ContentFolderDoc:
content_folder_name = f"{folder_name}/{region.name}"
sp_folders = self.sharepoint_manager.list_sub_folders(content_folder_name)
current_time = datetime.now(timezone.utc)
# Create or update folder metadata
folder = await ContentFolderDoc.find_one( folder = await ContentFolderDoc.find_one(
ContentFolderDoc.folder_name == folder_name, ContentFolderDoc.folder_name == content_folder_name
ContentFolderDoc.region == region
) )
if folder is None:
folder = ContentFolderDoc(
folder_name=content_folder_name,
content_directories=[],
udpate_time=current_time,
update_source=ContentSource.SHAREPOINT,
valid_thru=current_time + self.expiry_time,
region=region,
)
else:
folder.content_directories.clear()
return folder.content_directories if folder else None # Process subfolders in parallel
tasks = [
self.sharepoint_manager.process_subfolder(content_folder_name, sp_folder)
for sp_folder in sp_folders
]
try:
folder.content_directories = await asyncio.gather(*tasks)
except Exception as e:
raise RuntimeError(f"Failed to process subfolders: {e}")
# Save folder metadata
folder.udpate_time = current_time
folder.valid_thru = current_time + self.expiry_time
await folder.save()
return folder
async def retrieve_content_as_media_data(self, document_id: str) -> Optional[str]: async def retrieve_content_as_media_data(self, document_id: str) -> Optional[str]:
document_manager = DocumentManager() document_manager = DocumentManager()

View File

@ -1,14 +1,12 @@
from typing import Dict, List, Optional import asyncio
from common.config.app_settings import app_settings from datetime import datetime, timezone, timedelta
from datetime import datetime, timedelta, timezone from typing import List, Dict
from backend.sharepoint.sharepoint_graph_client import SharePointGraphClient from backend.sharepoint.sharepoint_graph_client import SharePointGraphClient
from common.constants.region import UserRegion
from backend.document.document_manager import DocumentManager from backend.document.document_manager import DocumentManager
from backend.content.constants import (
ContentSource,
ContentFileConstants,
)
from backend.content.models import ContentDirectory, ContentFolderDoc from backend.content.models import ContentDirectory, ContentFolderDoc
from backend.content.constants import ContentFileConstants, ContentSource
from common.config.app_settings import app_settings
class ContentSharePointManager: class ContentSharePointManager:
@ -25,127 +23,62 @@ class ContentSharePointManager:
) )
self.share_point_file_expiry = timedelta(hours=24) self.share_point_file_expiry = timedelta(hours=24)
def __generate_created__by__(self, folder_name): def list_sub_folders(self, folder_name: str) -> List[dict]:
return "content-service-" + folder_name.replace("/", "-").lower() """
Fetches the subfolders under the specified folder from SharePoint.
"""
try:
# Use SharePointGraphClient to list subfolders
return self.sharepoint_client.list_sub_folders(folder_name)
except Exception as e:
raise ValueError(f"Failed to list subfolders for {folder_name}: {e}")
async def retrieve_directories_for_folder( async def process_subfolder(
self, folder_name: str, region: UserRegion self, content_folder_name: str, sp_folder: dict
): ) -> ContentDirectory:
content_folder_name = folder_name + "/" + region.name content_directory = ContentDirectory(content_name=sp_folder["name"])
sp_folders = self.sharepoint_client.list_sub_folders(content_folder_name)
current_time = datetime.now(timezone.utc)
folder = await ContentFolderDoc.find_one(
ContentFolderDoc.folder_name == content_folder_name
)
if folder is None:
folder = ContentFolderDoc(
folder_name=folder_name,
content_directories=[],
udpate_time=current_time,
update_source=ContentSource.SHAREPOINT,
valid_thru=current_time + self.share_point_file_expiry,
region=region,
)
else:
folder.content_directories.clear()
for sp_folder in sp_folders:
content_directory = ContentDirectory(
content_name=sp_folder["name"],
cover_document_id=None,
summary_text=None,
title_text=None,
content_link=None,
content_document_id=None,
)
sp_files = self.sharepoint_client.list_files( sp_files = self.sharepoint_client.list_files(
content_folder_name + "/" + sp_folder["name"] f"{content_folder_name}/{sp_folder['name']}"
)
for sp_file in sp_files:
if (
sp_file["name"].lower()
== ContentFileConstants.COVER_FILE_NAME.lower()
):
cover_file_content = self.sharepoint_client.get_file_content(
sp_file["id"]
)
cover_document_manager = DocumentManager()
file_name = sp_file["name"].lower()
created_by = self.__generate_created__by__(folder_name=folder_name)
content_directory.cover_document_id = (
await cover_document_manager.save_document_file(
created_by, file_name, cover_file_content
)
) )
elif ( # Process files in parallel
sp_file["name"].lower() tasks = [
== ContentFileConstants.SUMMARY_FILE_NAME.lower() self.__process_file(file, content_directory, content_folder_name)
for file in sp_files
]
await asyncio.gather(*tasks)
return content_directory
async def __process_file(
self, sp_file: dict, content_directory: ContentDirectory, folder_name: str
): ):
content_directory.summary_text = ( document_manager = DocumentManager()
self.sharepoint_client.get_file_content(sp_file["id"]) file_content = self.sharepoint_client.get_file_content(sp_file["id"])
if sp_file["name"].lower() == ContentFileConstants.COVER_FILE_NAME.lower():
content_directory.cover_document_id = (
await document_manager.save_document_file(
self.__generate_created_by(folder_name),
sp_file["name"],
file_content,
) )
elif (
sp_file["name"].lower()
== ContentFileConstants.TITLE_FILE_NAME.lower()
):
content_directory.title_text = (
self.sharepoint_client.get_file_content(sp_file["id"])
)
elif (
sp_file["name"].lower()
== ContentFileConstants.CONTENT_LINK_FILE_NAME.lower()
):
content_directory.content_link = (
self.sharepoint_client.get_file_content(sp_file["id"])
)
elif (
sp_file["name"].lower()
== ContentFileConstants.CONTENT_HTML_FILE_NAME.lower()
):
content_directory.content_html = (
self.sharepoint_client.get_file_content(sp_file["id"])
)
elif (
sp_file["name"].lower()
== ContentFileConstants.CONTENT_TEXT_FILE_NAME.lower()
):
content_directory.content_text = (
self.sharepoint_client.get_file_content(sp_file["id"])
) )
elif sp_file["name"].lower() == ContentFileConstants.SUMMARY_FILE_NAME.lower():
content_directory.summary_text = file_content
elif sp_file["name"].lower() == ContentFileConstants.TITLE_FILE_NAME.lower():
content_directory.title_text = file_content
elif ( elif (
sp_file["name"].lower() sp_file["name"].lower()
== ContentFileConstants.CONTENT_PDF_FILE_NAME.lower() == ContentFileConstants.CONTENT_PDF_FILE_NAME.lower()
): ):
content_file_content = self.sharepoint_client.get_file_content(
sp_file["id"]
)
content_document_manager = DocumentManager()
file_name = sp_file["name"]
created_by = self.__generate_created__by__(folder_name=folder_name)
content_directory.content_document_id = ( content_directory.content_document_id = (
await content_document_manager.save_document_file( await document_manager.save_document_file(
created_by, file_name, content_file_content self.__generate_created_by(folder_name),
sp_file["name"],
file_content,
) )
) )
folder.content_directories.append(content_directory) def __generate_created_by(self, folder_name: str) -> str:
return f"content-service-{folder_name.replace('/', '-').lower()}"
folder.udpate_time = current_time
folder.update_source = ContentSource.SHAREPOINT
folder.valid_thru = current_time + self.share_point_file_expiry
await folder.save()
async def retrieve_directorys_for_all_folders(self):
current_time = datetime.now(timezone.utc)
folders = await ContentFolderDoc.find(
ContentFolderDoc.update_source == ContentSource.SHAREPOINT,
ContentFolderDoc.valid_thru < current_time,
).to_list()
for folder in folders:
await self.retrieve_directories_for_folder(
folder.folder_name, folder.region
)

View File

@ -1,30 +1,37 @@
from common.config.app_settings import app_settings from common.config.app_settings import app_settings
import httpx import httpx
class DocumentManager: class DocumentManager:
def __init__(self): def __init__(self):
self.storage_service_api_base = app_settings.CENTRAL_STORAGE_WEBAPI_URL_BASE.rstrip('/') + '/' self.storage_service_api_base = (
app_settings.CENTRAL_STORAGE_WEBAPI_URL_BASE.rstrip("/") + "/"
async def retrieve_document_info(self, document_id:str):
api_url = self.storage_service_api_base + "retrieve_document_info/" + document_id
async with httpx.AsyncClient() as client:
response = await client.get(api_url)
return response.json()
async def retrieve_document_as_http_media(self, document_id:str):
api_url = self.storage_service_api_base + "read-document-as-http-media/" + document_id
async with httpx.AsyncClient() as client:
response = await client.get(api_url)
return response.json()
async def save_document_file(self,associated_with:str, name:str,blob:bytes)->str:
api_url = self.storage_service_api_base + "upload-file"
files = {'file':(name,blob)}
async with httpx.AsyncClient() as client:
response = await client.post(api_url,
data={
'associated_with':associated_with
},
files=files
) )
return response.json()['document_id']
async def retrieve_document_info(self, document_id: str):
api_url = (
self.storage_service_api_base + "retrieve_document_info/" + document_id
)
async with httpx.AsyncClient() as client:
response = await client.get(api_url)
return response.json()
async def retrieve_document_as_http_media(self, document_id: str):
api_url = (
self.storage_service_api_base + "read-document-as-http-media/" + document_id
)
async with httpx.AsyncClient() as client:
response = await client.get(api_url)
return response.json()
async def save_document_file(
self, associated_with: str, name: str, blob: bytes
) -> str:
api_url = self.storage_service_api_base + "upload-file"
files = {"file": (name, blob)}
print("this is files", files)
async with httpx.AsyncClient() as client:
response = await client.post(
api_url, data={"associated_with": associated_with}, files=files
)
return response.json()["document_id"]

View File

@ -3,7 +3,6 @@ from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from pydantic import BaseModel from pydantic import BaseModel
from backend.content.content_service import ContentService from backend.content.content_service import ContentService
from fastapi_cache.decorator import cache # Import the cache decorator
router = APIRouter() router = APIRouter()
@ -15,10 +14,6 @@ router = APIRouter()
description="retrieve content as media data which can be posted to web page.", description="retrieve content as media data which can be posted to web page.",
response_description="Media data", response_description="Media data",
) )
@cache(
expire=300, # Cache the result for 5 minutes
key_builder=lambda func, *args, **kwargs: f"content-media:{kwargs.get('document_id', args[0] if len(args) > 0 else '')}",
)
async def retrieve_content_as_media_data(document_id: str): async def retrieve_content_as_media_data(document_id: str):
result = await ContentService().retrieve_content_as_media_data(document_id) result = await ContentService().retrieve_content_as_media_data(document_id)
return JSONResponse(content=jsonable_encoder(result)) return JSONResponse(content=jsonable_encoder(result))

View File

@ -3,7 +3,6 @@ from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from backend.content.content_service import ContentService from backend.content.content_service import ContentService
from common.constants.region import UserRegion from common.constants.region import UserRegion
from fastapi_cache.decorator import cache # Import the cache decorator
router = APIRouter() router = APIRouter()
@ -15,10 +14,7 @@ router = APIRouter()
description="retrieve directories for a folder, such as testimony, legal, etc", description="retrieve directories for a folder, such as testimony, legal, etc",
response_description="The list of directories under the folder", response_description="The list of directories under the folder",
) )
@cache( # @cache(expire=300) # Cache results for 5 minutes
expire=300, # Cache for 300 seconds
key_builder=lambda func, *args, **kwargs: f"folder:{kwargs.get('folder_name', args[0] if len(args) > 0 else '')}:region:{kwargs.get('region', args[1] if len(args) > 1 else '')}",
)
async def retrieve_directories_for_folder(folder_name: str, region: UserRegion): async def retrieve_directories_for_folder(folder_name: str, region: UserRegion):
result = await ContentService().retrieve_directories_for_folder(folder_name, region) result = await ContentService().retrieve_directories_for_folder(folder_name, region)
return JSONResponse(content=jsonable_encoder(result)) return JSONResponse(content=jsonable_encoder(result))