diff --git a/apps/central_storage/backend/application/document_hub.py b/apps/central_storage/backend/application/document_hub.py index 1fa1439..d50ac39 100644 --- a/apps/central_storage/backend/application/document_hub.py +++ b/apps/central_storage/backend/application/document_hub.py @@ -4,7 +4,9 @@ from backend.business.document_manager import ( class DocumentHub: - def __init__(self, ): + def __init__( + self, + ): self.document_manager = DocumentManager() return @@ -12,7 +14,9 @@ class DocumentHub: return await self.document_manager.retrieve_document_info(document_id) async def read_document_file_as_http_media_data(self, document_id: str): - return await self.document_manager.read_document_file_as_http_media_data(document_id) + return await self.document_manager.read_document_file_as_http_media_data( + document_id + ) async def upload_document( self, associated_with: str, file_name: str, file_data: bytes @@ -25,3 +29,8 @@ class DocumentHub: return await self.document_manager.upload_file( associated_with, file_name, file_data ) + + async def delete_documents(self, document_ids: list): + for document_id in document_ids: + await self.document_manager.delete_document(document_id) + return diff --git a/apps/central_storage/backend/business/document_manager.py b/apps/central_storage/backend/business/document_manager.py index 7c7eee1..08b212e 100644 --- a/apps/central_storage/backend/business/document_manager.py +++ b/apps/central_storage/backend/business/document_manager.py @@ -23,7 +23,6 @@ class DocumentManager: return await self.document_service.read_document_file_as_http_media_data() - async def upload_file( self, associated_with: str, file_name: str, file_data: bytes ) -> bool: @@ -40,3 +39,8 @@ class DocumentManager: # request_id, document_id # ) # await self.proposal_store.update_latest_proposal_with_documents(request_id) + + async def delete_document(self, document_id: str): + await self.document_service.load_document(document_id=document_id) + await self.document_service.remove_document_file() + return diff --git a/apps/central_storage/backend/infra/azure_storage/blob_handler.py b/apps/central_storage/backend/infra/azure_storage/blob_handler.py index 94851c4..64fa74d 100644 --- a/apps/central_storage/backend/infra/azure_storage/blob_handler.py +++ b/apps/central_storage/backend/infra/azure_storage/blob_handler.py @@ -77,7 +77,7 @@ class AzureBlobHandler: app_settings.AZURE_STORAGE_DOCUMENT_API_ENDPOINT, credential=app_settings.AZURE_STORAGE_DOCUMENT_API_KEY, ) as blob_service_client: - blob_client = await blob_service_client.get_blob_client( + blob_client = blob_service_client.get_blob_client( container=container_name, blob=blob_name ) await blob_client.delete_blob() diff --git a/apps/central_storage/backend/services/document_service.py b/apps/central_storage/backend/services/document_service.py index ec1bb6d..58ea1e1 100644 --- a/apps/central_storage/backend/services/document_service.py +++ b/apps/central_storage/backend/services/document_service.py @@ -127,6 +127,7 @@ class DocumentService: container_name=self.__get_container_name__(), blob_name=self.__get_blob_name__(), ) + await self.__document_doc.delete() async def read_document_file_as_http_media_data(self) -> str: self.__validate_document_doc() diff --git a/apps/central_storage/webapi/routes/__init__.py b/apps/central_storage/webapi/routes/__init__.py index 7b9d3b8..ba2a1c7 100644 --- a/apps/central_storage/webapi/routes/__init__.py +++ b/apps/central_storage/webapi/routes/__init__.py @@ -2,10 +2,12 @@ from fastapi import APIRouter from .retrieve_document_info import router as ri_router from .upload_file import router as ud_router from .read_document_as_http_media import router as rd_router +from .delete_file import router as df_router api_router = APIRouter(prefix="/central_storage") api_router.include_router(ri_router, tags=["document"]) api_router.include_router(ud_router, tags=["document"]) api_router.include_router(rd_router, tags=["document"]) +api_router.include_router(df_router, tags=["document"]) websocket_router = APIRouter() diff --git a/apps/central_storage/webapi/routes/delete_file.py b/apps/central_storage/webapi/routes/delete_file.py new file mode 100644 index 0000000..23f3fc8 --- /dev/null +++ b/apps/central_storage/webapi/routes/delete_file.py @@ -0,0 +1,42 @@ +from fastapi import APIRouter, HTTPException, BackgroundTasks +from fastapi.responses import JSONResponse +from pydantic import BaseModel +from backend.application.document_hub import DocumentHub + +router = APIRouter() + + +class DeleteFileRequest(BaseModel): + document_ids: list[str] + + +async def background_delete_documents(document_ids: list[str]): + """ + Background task to delete documents from Azure Blob Storage. + """ + document_hub = DocumentHub() + try: + await document_hub.delete_documents(document_ids) + print(f"Successfully deleted documents: {document_ids}") + except Exception as e: + print(f"Error occurred during document deletion: {e}") + + +@router.delete( + "/delete-file", + summary="Queue file deletion from blob storage", + description="Queue deletion of files from Azure Blob Storage given a list of document IDs", +) +async def delete_file(request: DeleteFileRequest, background_tasks: BackgroundTasks): + """ + Queue the deletion process and respond immediately. + """ + try: + # Queue the background task for deletion + background_tasks.add_task(background_delete_documents, request.document_ids) + return JSONResponse( + content={"message": "File deletion has been queued for processing"}, + status_code=202, + ) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) diff --git a/apps/content/backend/content/document_cleaner.py b/apps/content/backend/content/document_cleaner.py new file mode 100644 index 0000000..d08f174 --- /dev/null +++ b/apps/content/backend/content/document_cleaner.py @@ -0,0 +1,56 @@ +from typing import Set +from backend.content.models import DocumentDoc +from backend.content.models import ContentFolderDoc +from backend.document.document_manager import DocumentManager + + +class DocumentCleaner: + @staticmethod + async def clean_up_unused_documents(): + """ + Remove unused DocumentDoc records after all content_directories are updated. + """ + + # Collect all valid document IDs from all content folders + valid_document_ids: Set[str] = set() + async for folder in ContentFolderDoc.find(): + for directory in folder.content_directories: + valid_document_ids.update( + filter( + None, + [ + directory.cover_document_id, + directory.summary_text_new, + directory.title_text_new, + directory.content_html_new, + directory.content_text_new, + directory.content_document_id, + ], + ) + ) + + print( + f"Valid document IDs (from updated content directories): {valid_document_ids}" + ) + + # Retrieve all document IDs from DocumentDoc + all_document_ids = set() + document_locations = {} + async for doc in DocumentDoc.find( + {"created_by": {"$regex": "^content-service-"}} + ): # Fetch all documents that are created by content-service + if doc.document_id: # Ensure document_id is not None + all_document_ids.add(doc.document_id) + document_locations[doc.document_id] = doc.location + print(f"All document IDs (from DocumentDoc): {all_document_ids}") + + # Find unused document IDs + unused_document_ids = all_document_ids - valid_document_ids + print(f"Unused document IDs: {unused_document_ids}") + + # Remove unused DocumentDoc records + if unused_document_ids: + document_manager = DocumentManager() + await document_manager.delete_documents(unused_document_ids) + else: + print("No unused DocumentDocs to clean up.") diff --git a/apps/content/backend/content/refresh_sharepoint_content.py b/apps/content/backend/content/refresh_sharepoint_content.py index 367e2c0..4d1f60d 100644 --- a/apps/content/backend/content/refresh_sharepoint_content.py +++ b/apps/content/backend/content/refresh_sharepoint_content.py @@ -1,4 +1,5 @@ from backend.content.content_sharepoint_manager import ContentSharePointManager +from backend.content.document_cleaner import DocumentCleaner from common.constants.region import UserRegion from datetime import datetime, timezone, timedelta from common.log.log_utils import log_entry_exit_async @@ -10,10 +11,11 @@ from backend.content.constants import ContentSource class SharePointContentRefresher: def __init__(self) -> None: self.content_sharepoint_manager = ContentSharePointManager() + self.document_cleaner = DocumentCleaner() @log_entry_exit_async async def refresh_all_in_database(self) -> None: - await self.content_sharepoint_manager.retrieve_directorys_for_all_folders() + await self.content_sharepoint_manager.retrieve_directories_for_all_folders() @log_entry_exit_async async def refresh_selected(self) -> None: @@ -37,3 +39,7 @@ class SharePointContentRefresher: await self.content_sharepoint_manager.retrieve_directories_for_folder( folder_name=folder["folder"], region=folder["region"] ) + + # Clean up unused DocumentDocs + # This actually can be run separately and had idempotency guarantee + await self.document_cleaner.clean_up_unused_documents() diff --git a/apps/content/backend/document/document_manager.py b/apps/content/backend/document/document_manager.py index aeb9a07..34253de 100644 --- a/apps/content/backend/document/document_manager.py +++ b/apps/content/backend/document/document_manager.py @@ -36,6 +36,20 @@ class DocumentManager: ) return response.json()["document_id"] + async def delete_documents(self, document_ids: list): + api_url = self.storage_service_api_base + "delete-file" + document_ids_list = list(document_ids) + async with httpx.AsyncClient() as client: + response = await client.request( + method="DELETE", + url=api_url, + json={"document_ids": document_ids_list}, + ) + if response.status_code == 202: + print("Deletion request has been queued successfully.") + else: + print(f"Failed to queue deletion: {response.text}") + async def cleanup_document(self): # Corrected query with regex documents = await DocumentDoc.find(