Add delete-file endpoint, add remove old documents logic in refresh sharepoint logic

This commit is contained in:
Jet Li 2025-02-03 09:11:48 +00:00
parent 9aa8ee3afd
commit 81f734c19a
9 changed files with 139 additions and 5 deletions

View File

@ -4,7 +4,9 @@ from backend.business.document_manager import (
class DocumentHub: class DocumentHub:
def __init__(self, ): def __init__(
self,
):
self.document_manager = DocumentManager() self.document_manager = DocumentManager()
return return
@ -12,7 +14,9 @@ class DocumentHub:
return await self.document_manager.retrieve_document_info(document_id) return await self.document_manager.retrieve_document_info(document_id)
async def read_document_file_as_http_media_data(self, document_id: str): async def read_document_file_as_http_media_data(self, document_id: str):
return await self.document_manager.read_document_file_as_http_media_data(document_id) return await self.document_manager.read_document_file_as_http_media_data(
document_id
)
async def upload_document( async def upload_document(
self, associated_with: str, file_name: str, file_data: bytes self, associated_with: str, file_name: str, file_data: bytes
@ -25,3 +29,8 @@ class DocumentHub:
return await self.document_manager.upload_file( return await self.document_manager.upload_file(
associated_with, file_name, file_data associated_with, file_name, file_data
) )
async def delete_documents(self, document_ids: list):
for document_id in document_ids:
await self.document_manager.delete_document(document_id)
return

View File

@ -23,7 +23,6 @@ class DocumentManager:
return await self.document_service.read_document_file_as_http_media_data() return await self.document_service.read_document_file_as_http_media_data()
async def upload_file( async def upload_file(
self, associated_with: str, file_name: str, file_data: bytes self, associated_with: str, file_name: str, file_data: bytes
) -> bool: ) -> bool:
@ -40,3 +39,8 @@ class DocumentManager:
# request_id, document_id # request_id, document_id
# ) # )
# await self.proposal_store.update_latest_proposal_with_documents(request_id) # await self.proposal_store.update_latest_proposal_with_documents(request_id)
async def delete_document(self, document_id: str):
await self.document_service.load_document(document_id=document_id)
await self.document_service.remove_document_file()
return

View File

@ -77,7 +77,7 @@ class AzureBlobHandler:
app_settings.AZURE_STORAGE_DOCUMENT_API_ENDPOINT, app_settings.AZURE_STORAGE_DOCUMENT_API_ENDPOINT,
credential=app_settings.AZURE_STORAGE_DOCUMENT_API_KEY, credential=app_settings.AZURE_STORAGE_DOCUMENT_API_KEY,
) as blob_service_client: ) as blob_service_client:
blob_client = await blob_service_client.get_blob_client( blob_client = blob_service_client.get_blob_client(
container=container_name, blob=blob_name container=container_name, blob=blob_name
) )
await blob_client.delete_blob() await blob_client.delete_blob()

View File

@ -127,6 +127,7 @@ class DocumentService:
container_name=self.__get_container_name__(), container_name=self.__get_container_name__(),
blob_name=self.__get_blob_name__(), blob_name=self.__get_blob_name__(),
) )
await self.__document_doc.delete()
async def read_document_file_as_http_media_data(self) -> str: async def read_document_file_as_http_media_data(self) -> str:
self.__validate_document_doc() self.__validate_document_doc()

View File

@ -2,10 +2,12 @@ from fastapi import APIRouter
from .retrieve_document_info import router as ri_router from .retrieve_document_info import router as ri_router
from .upload_file import router as ud_router from .upload_file import router as ud_router
from .read_document_as_http_media import router as rd_router from .read_document_as_http_media import router as rd_router
from .delete_file import router as df_router
api_router = APIRouter(prefix="/central_storage") api_router = APIRouter(prefix="/central_storage")
api_router.include_router(ri_router, tags=["document"]) api_router.include_router(ri_router, tags=["document"])
api_router.include_router(ud_router, tags=["document"]) api_router.include_router(ud_router, tags=["document"])
api_router.include_router(rd_router, tags=["document"]) api_router.include_router(rd_router, tags=["document"])
api_router.include_router(df_router, tags=["document"])
websocket_router = APIRouter() websocket_router = APIRouter()

View File

@ -0,0 +1,42 @@
from fastapi import APIRouter, HTTPException, BackgroundTasks
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from backend.application.document_hub import DocumentHub
router = APIRouter()
class DeleteFileRequest(BaseModel):
document_ids: list[str]
async def background_delete_documents(document_ids: list[str]):
"""
Background task to delete documents from Azure Blob Storage.
"""
document_hub = DocumentHub()
try:
await document_hub.delete_documents(document_ids)
print(f"Successfully deleted documents: {document_ids}")
except Exception as e:
print(f"Error occurred during document deletion: {e}")
@router.delete(
"/delete-file",
summary="Queue file deletion from blob storage",
description="Queue deletion of files from Azure Blob Storage given a list of document IDs",
)
async def delete_file(request: DeleteFileRequest, background_tasks: BackgroundTasks):
"""
Queue the deletion process and respond immediately.
"""
try:
# Queue the background task for deletion
background_tasks.add_task(background_delete_documents, request.document_ids)
return JSONResponse(
content={"message": "File deletion has been queued for processing"},
status_code=202,
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,56 @@
from typing import Set
from backend.content.models import DocumentDoc
from backend.content.models import ContentFolderDoc
from backend.document.document_manager import DocumentManager
class DocumentCleaner:
@staticmethod
async def clean_up_unused_documents():
"""
Remove unused DocumentDoc records after all content_directories are updated.
"""
# Collect all valid document IDs from all content folders
valid_document_ids: Set[str] = set()
async for folder in ContentFolderDoc.find():
for directory in folder.content_directories:
valid_document_ids.update(
filter(
None,
[
directory.cover_document_id,
directory.summary_text_new,
directory.title_text_new,
directory.content_html_new,
directory.content_text_new,
directory.content_document_id,
],
)
)
print(
f"Valid document IDs (from updated content directories): {valid_document_ids}"
)
# Retrieve all document IDs from DocumentDoc
all_document_ids = set()
document_locations = {}
async for doc in DocumentDoc.find(
{"created_by": {"$regex": "^content-service-"}}
): # Fetch all documents that are created by content-service
if doc.document_id: # Ensure document_id is not None
all_document_ids.add(doc.document_id)
document_locations[doc.document_id] = doc.location
print(f"All document IDs (from DocumentDoc): {all_document_ids}")
# Find unused document IDs
unused_document_ids = all_document_ids - valid_document_ids
print(f"Unused document IDs: {unused_document_ids}")
# Remove unused DocumentDoc records
if unused_document_ids:
document_manager = DocumentManager()
await document_manager.delete_documents(unused_document_ids)
else:
print("No unused DocumentDocs to clean up.")

View File

@ -1,4 +1,5 @@
from backend.content.content_sharepoint_manager import ContentSharePointManager from backend.content.content_sharepoint_manager import ContentSharePointManager
from backend.content.document_cleaner import DocumentCleaner
from common.constants.region import UserRegion from common.constants.region import UserRegion
from datetime import datetime, timezone, timedelta from datetime import datetime, timezone, timedelta
from common.log.log_utils import log_entry_exit_async from common.log.log_utils import log_entry_exit_async
@ -10,10 +11,11 @@ from backend.content.constants import ContentSource
class SharePointContentRefresher: class SharePointContentRefresher:
def __init__(self) -> None: def __init__(self) -> None:
self.content_sharepoint_manager = ContentSharePointManager() self.content_sharepoint_manager = ContentSharePointManager()
self.document_cleaner = DocumentCleaner()
@log_entry_exit_async @log_entry_exit_async
async def refresh_all_in_database(self) -> None: async def refresh_all_in_database(self) -> None:
await self.content_sharepoint_manager.retrieve_directorys_for_all_folders() await self.content_sharepoint_manager.retrieve_directories_for_all_folders()
@log_entry_exit_async @log_entry_exit_async
async def refresh_selected(self) -> None: async def refresh_selected(self) -> None:
@ -37,3 +39,7 @@ class SharePointContentRefresher:
await self.content_sharepoint_manager.retrieve_directories_for_folder( await self.content_sharepoint_manager.retrieve_directories_for_folder(
folder_name=folder["folder"], region=folder["region"] folder_name=folder["folder"], region=folder["region"]
) )
# Clean up unused DocumentDocs
# This actually can be run separately and had idempotency guarantee
await self.document_cleaner.clean_up_unused_documents()

View File

@ -36,6 +36,20 @@ class DocumentManager:
) )
return response.json()["document_id"] return response.json()["document_id"]
async def delete_documents(self, document_ids: list):
api_url = self.storage_service_api_base + "delete-file"
document_ids_list = list(document_ids)
async with httpx.AsyncClient() as client:
response = await client.request(
method="DELETE",
url=api_url,
json={"document_ids": document_ids_list},
)
if response.status_code == 202:
print("Deletion request has been queued successfully.")
else:
print(f"Failed to queue deletion: {response.text}")
async def cleanup_document(self): async def cleanup_document(self):
# Corrected query with regex # Corrected query with regex
documents = await DocumentDoc.find( documents = await DocumentDoc.find(