Removing cache, need further tuning

This commit is contained in:
jetli 2024-12-26 08:40:12 +00:00
parent 1972a5758b
commit 550c49f3ec
6 changed files with 142 additions and 167 deletions

View File

@ -1,3 +1,4 @@
import logging
from common.config.app_settings import app_settings
from beanie import init_beanie
from motor.motor_asyncio import AsyncIOMotorClient
@ -8,6 +9,9 @@ def register(app):
app.debug = "auth_mongo_debug"
app.title = "auth_mongo_name"
# Configure logging for pymongo
logging.getLogger("pymongo").setLevel(logging.WARNING) # Suppress DEBUG logs
@app.on_event("startup")
async def start_database():
await initiate_database()

View File

@ -1,35 +1,75 @@
from typing import Dict, List, Optional
import asyncio
from typing import List, Optional
from datetime import datetime, timedelta, timezone
from .constants import ContentSource, ContentMediaType, ContentDataFormat
from .models import ContentDirectory, ContentFolderDoc
from common.config.app_settings import app_settings
from backend.document.document_manager import DocumentManager
from common.constants.region import UserRegion
from backend.document.document_manager import DocumentManager
from .content_sharepoint_manager import ContentSharePointManager
import pytz
from backend.content.constants import ContentSource
class ContentService:
def __init__(self) -> None:
pass
self.sharepoint_manager = ContentSharePointManager()
self.expiry_time = timedelta(hours=24)
async def retrieve_directories_for_folder(
self, folder_name: str, region: UserRegion
) -> List[ContentDirectory]:
# Check cache
folder_key = f"{folder_name}/{region.name}"
folder = await ContentFolderDoc.find_one(
ContentFolderDoc.folder_name == folder_name,
ContentFolderDoc.region == region
ContentFolderDoc.folder_name == folder_key,
ContentFolderDoc.region == region,
)
if folder is None or folder.valid_thru.replace(tzinfo=timezone.utc) < datetime.now(timezone.utc):
await ContentSharePointManager().retrieve_directories_for_folder(folder_name=folder_name, region=region)
# Refresh cache if expired or not present
if folder is None or folder.valid_thru.replace(
tzinfo=timezone.utc
) < datetime.now(timezone.utc):
folder = await self.__refresh_folder_from_sharepoint(folder_name, region)
folder = await ContentFolderDoc.find_one(
ContentFolderDoc.folder_name == folder_name,
ContentFolderDoc.region == region
return folder.content_directories if folder else []
async def __refresh_folder_from_sharepoint(
self, folder_name: str, region: UserRegion
) -> ContentFolderDoc:
content_folder_name = f"{folder_name}/{region.name}"
sp_folders = self.sharepoint_manager.list_sub_folders(content_folder_name)
current_time = datetime.now(timezone.utc)
# Create or update folder metadata
folder = await ContentFolderDoc.find_one(
ContentFolderDoc.folder_name == content_folder_name
)
if folder is None:
folder = ContentFolderDoc(
folder_name=content_folder_name,
content_directories=[],
udpate_time=current_time,
update_source=ContentSource.SHAREPOINT,
valid_thru=current_time + self.expiry_time,
region=region,
)
else:
folder.content_directories.clear()
return folder.content_directories if folder else None
# Process subfolders in parallel
tasks = [
self.sharepoint_manager.process_subfolder(content_folder_name, sp_folder)
for sp_folder in sp_folders
]
try:
folder.content_directories = await asyncio.gather(*tasks)
except Exception as e:
raise RuntimeError(f"Failed to process subfolders: {e}")
# Save folder metadata
folder.udpate_time = current_time
folder.valid_thru = current_time + self.expiry_time
await folder.save()
return folder
async def retrieve_content_as_media_data(self, document_id: str) -> Optional[str]:
document_manager = DocumentManager()

View File

@ -1,14 +1,12 @@
from typing import Dict, List, Optional
from common.config.app_settings import app_settings
from datetime import datetime, timedelta, timezone
import asyncio
from datetime import datetime, timezone, timedelta
from typing import List, Dict
from backend.sharepoint.sharepoint_graph_client import SharePointGraphClient
from common.constants.region import UserRegion
from backend.document.document_manager import DocumentManager
from backend.content.constants import (
ContentSource,
ContentFileConstants,
)
from backend.content.models import ContentDirectory, ContentFolderDoc
from backend.content.constants import ContentFileConstants, ContentSource
from common.config.app_settings import app_settings
class ContentSharePointManager:
@ -25,127 +23,62 @@ class ContentSharePointManager:
)
self.share_point_file_expiry = timedelta(hours=24)
def __generate_created__by__(self, folder_name):
return "content-service-" + folder_name.replace("/", "-").lower()
def list_sub_folders(self, folder_name: str) -> List[dict]:
"""
Fetches the subfolders under the specified folder from SharePoint.
"""
try:
# Use SharePointGraphClient to list subfolders
return self.sharepoint_client.list_sub_folders(folder_name)
except Exception as e:
raise ValueError(f"Failed to list subfolders for {folder_name}: {e}")
async def retrieve_directories_for_folder(
self, folder_name: str, region: UserRegion
):
content_folder_name = folder_name + "/" + region.name
sp_folders = self.sharepoint_client.list_sub_folders(content_folder_name)
current_time = datetime.now(timezone.utc)
folder = await ContentFolderDoc.find_one(
ContentFolderDoc.folder_name == content_folder_name
async def process_subfolder(
self, content_folder_name: str, sp_folder: dict
) -> ContentDirectory:
content_directory = ContentDirectory(content_name=sp_folder["name"])
sp_files = self.sharepoint_client.list_files(
f"{content_folder_name}/{sp_folder['name']}"
)
if folder is None:
folder = ContentFolderDoc(
folder_name=folder_name,
content_directories=[],
udpate_time=current_time,
update_source=ContentSource.SHAREPOINT,
valid_thru=current_time + self.share_point_file_expiry,
region=region,
# Process files in parallel
tasks = [
self.__process_file(file, content_directory, content_folder_name)
for file in sp_files
]
await asyncio.gather(*tasks)
return content_directory
async def __process_file(
self, sp_file: dict, content_directory: ContentDirectory, folder_name: str
):
document_manager = DocumentManager()
file_content = self.sharepoint_client.get_file_content(sp_file["id"])
if sp_file["name"].lower() == ContentFileConstants.COVER_FILE_NAME.lower():
content_directory.cover_document_id = (
await document_manager.save_document_file(
self.__generate_created_by(folder_name),
sp_file["name"],
file_content,
)
)
else:
folder.content_directories.clear()
for sp_folder in sp_folders:
content_directory = ContentDirectory(
content_name=sp_folder["name"],
cover_document_id=None,
summary_text=None,
title_text=None,
content_link=None,
content_document_id=None,
elif sp_file["name"].lower() == ContentFileConstants.SUMMARY_FILE_NAME.lower():
content_directory.summary_text = file_content
elif sp_file["name"].lower() == ContentFileConstants.TITLE_FILE_NAME.lower():
content_directory.title_text = file_content
elif (
sp_file["name"].lower()
== ContentFileConstants.CONTENT_PDF_FILE_NAME.lower()
):
content_directory.content_document_id = (
await document_manager.save_document_file(
self.__generate_created_by(folder_name),
sp_file["name"],
file_content,
)
)
sp_files = self.sharepoint_client.list_files(
content_folder_name + "/" + sp_folder["name"]
)
for sp_file in sp_files:
if (
sp_file["name"].lower()
== ContentFileConstants.COVER_FILE_NAME.lower()
):
cover_file_content = self.sharepoint_client.get_file_content(
sp_file["id"]
)
cover_document_manager = DocumentManager()
file_name = sp_file["name"].lower()
created_by = self.__generate_created__by__(folder_name=folder_name)
content_directory.cover_document_id = (
await cover_document_manager.save_document_file(
created_by, file_name, cover_file_content
)
)
elif (
sp_file["name"].lower()
== ContentFileConstants.SUMMARY_FILE_NAME.lower()
):
content_directory.summary_text = (
self.sharepoint_client.get_file_content(sp_file["id"])
)
elif (
sp_file["name"].lower()
== ContentFileConstants.TITLE_FILE_NAME.lower()
):
content_directory.title_text = (
self.sharepoint_client.get_file_content(sp_file["id"])
)
elif (
sp_file["name"].lower()
== ContentFileConstants.CONTENT_LINK_FILE_NAME.lower()
):
content_directory.content_link = (
self.sharepoint_client.get_file_content(sp_file["id"])
)
elif (
sp_file["name"].lower()
== ContentFileConstants.CONTENT_HTML_FILE_NAME.lower()
):
content_directory.content_html = (
self.sharepoint_client.get_file_content(sp_file["id"])
)
elif (
sp_file["name"].lower()
== ContentFileConstants.CONTENT_TEXT_FILE_NAME.lower()
):
content_directory.content_text = (
self.sharepoint_client.get_file_content(sp_file["id"])
)
elif (
sp_file["name"].lower()
== ContentFileConstants.CONTENT_PDF_FILE_NAME.lower()
):
content_file_content = self.sharepoint_client.get_file_content(
sp_file["id"]
)
content_document_manager = DocumentManager()
file_name = sp_file["name"]
created_by = self.__generate_created__by__(folder_name=folder_name)
content_directory.content_document_id = (
await content_document_manager.save_document_file(
created_by, file_name, content_file_content
)
)
folder.content_directories.append(content_directory)
folder.udpate_time = current_time
folder.update_source = ContentSource.SHAREPOINT
folder.valid_thru = current_time + self.share_point_file_expiry
await folder.save()
async def retrieve_directorys_for_all_folders(self):
current_time = datetime.now(timezone.utc)
folders = await ContentFolderDoc.find(
ContentFolderDoc.update_source == ContentSource.SHAREPOINT,
ContentFolderDoc.valid_thru < current_time,
).to_list()
for folder in folders:
await self.retrieve_directories_for_folder(
folder.folder_name, folder.region
)
def __generate_created_by(self, folder_name: str) -> str:
return f"content-service-{folder_name.replace('/', '-').lower()}"

View File

@ -1,30 +1,37 @@
from common.config.app_settings import app_settings
import httpx
class DocumentManager:
def __init__(self):
self.storage_service_api_base = app_settings.CENTRAL_STORAGE_WEBAPI_URL_BASE.rstrip('/') + '/'
self.storage_service_api_base = (
app_settings.CENTRAL_STORAGE_WEBAPI_URL_BASE.rstrip("/") + "/"
)
async def retrieve_document_info(self, document_id:str):
api_url = self.storage_service_api_base + "retrieve_document_info/" + document_id
async def retrieve_document_info(self, document_id: str):
api_url = (
self.storage_service_api_base + "retrieve_document_info/" + document_id
)
async with httpx.AsyncClient() as client:
response = await client.get(api_url)
return response.json()
async def retrieve_document_as_http_media(self, document_id:str):
api_url = self.storage_service_api_base + "read-document-as-http-media/" + document_id
async def retrieve_document_as_http_media(self, document_id: str):
api_url = (
self.storage_service_api_base + "read-document-as-http-media/" + document_id
)
async with httpx.AsyncClient() as client:
response = await client.get(api_url)
return response.json()
async def save_document_file(self,associated_with:str, name:str,blob:bytes)->str:
async def save_document_file(
self, associated_with: str, name: str, blob: bytes
) -> str:
api_url = self.storage_service_api_base + "upload-file"
files = {'file':(name,blob)}
files = {"file": (name, blob)}
print("this is files", files)
async with httpx.AsyncClient() as client:
response = await client.post(api_url,
data={
'associated_with':associated_with
},
files=files
)
return response.json()['document_id']
response = await client.post(
api_url, data={"associated_with": associated_with}, files=files
)
return response.json()["document_id"]

View File

@ -3,7 +3,6 @@ from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from backend.content.content_service import ContentService
from fastapi_cache.decorator import cache # Import the cache decorator
router = APIRouter()
@ -15,10 +14,6 @@ router = APIRouter()
description="retrieve content as media data which can be posted to web page.",
response_description="Media data",
)
@cache(
expire=300, # Cache the result for 5 minutes
key_builder=lambda func, *args, **kwargs: f"content-media:{kwargs.get('document_id', args[0] if len(args) > 0 else '')}",
)
async def retrieve_content_as_media_data(document_id: str):
result = await ContentService().retrieve_content_as_media_data(document_id)
return JSONResponse(content=jsonable_encoder(result))

View File

@ -3,7 +3,6 @@ from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
from backend.content.content_service import ContentService
from common.constants.region import UserRegion
from fastapi_cache.decorator import cache # Import the cache decorator
router = APIRouter()
@ -15,10 +14,7 @@ router = APIRouter()
description="retrieve directories for a folder, such as testimony, legal, etc",
response_description="The list of directories under the folder",
)
@cache(
expire=300, # Cache for 300 seconds
key_builder=lambda func, *args, **kwargs: f"folder:{kwargs.get('folder_name', args[0] if len(args) > 0 else '')}:region:{kwargs.get('region', args[1] if len(args) > 1 else '')}",
)
# @cache(expire=300) # Cache results for 5 minutes
async def retrieve_directories_for_folder(folder_name: str, region: UserRegion):
result = await ContentService().retrieve_directories_for_folder(folder_name, region)
return JSONResponse(content=jsonable_encoder(result))