Add delete-file endpoint, add remove old documents logic in refresh sharepoint logic
This commit is contained in:
parent
9aa8ee3afd
commit
81f734c19a
@ -4,7 +4,9 @@ from backend.business.document_manager import (
|
||||
|
||||
|
||||
class DocumentHub:
|
||||
def __init__(self, ):
|
||||
def __init__(
|
||||
self,
|
||||
):
|
||||
self.document_manager = DocumentManager()
|
||||
return
|
||||
|
||||
@ -12,7 +14,9 @@ class DocumentHub:
|
||||
return await self.document_manager.retrieve_document_info(document_id)
|
||||
|
||||
async def read_document_file_as_http_media_data(self, document_id: str):
|
||||
return await self.document_manager.read_document_file_as_http_media_data(document_id)
|
||||
return await self.document_manager.read_document_file_as_http_media_data(
|
||||
document_id
|
||||
)
|
||||
|
||||
async def upload_document(
|
||||
self, associated_with: str, file_name: str, file_data: bytes
|
||||
@ -25,3 +29,8 @@ class DocumentHub:
|
||||
return await self.document_manager.upload_file(
|
||||
associated_with, file_name, file_data
|
||||
)
|
||||
|
||||
async def delete_documents(self, document_ids: list):
|
||||
for document_id in document_ids:
|
||||
await self.document_manager.delete_document(document_id)
|
||||
return
|
||||
|
||||
@ -23,7 +23,6 @@ class DocumentManager:
|
||||
|
||||
return await self.document_service.read_document_file_as_http_media_data()
|
||||
|
||||
|
||||
async def upload_file(
|
||||
self, associated_with: str, file_name: str, file_data: bytes
|
||||
) -> bool:
|
||||
@ -40,3 +39,8 @@ class DocumentManager:
|
||||
# request_id, document_id
|
||||
# )
|
||||
# await self.proposal_store.update_latest_proposal_with_documents(request_id)
|
||||
|
||||
async def delete_document(self, document_id: str):
|
||||
await self.document_service.load_document(document_id=document_id)
|
||||
await self.document_service.remove_document_file()
|
||||
return
|
||||
|
||||
@ -77,7 +77,7 @@ class AzureBlobHandler:
|
||||
app_settings.AZURE_STORAGE_DOCUMENT_API_ENDPOINT,
|
||||
credential=app_settings.AZURE_STORAGE_DOCUMENT_API_KEY,
|
||||
) as blob_service_client:
|
||||
blob_client = await blob_service_client.get_blob_client(
|
||||
blob_client = blob_service_client.get_blob_client(
|
||||
container=container_name, blob=blob_name
|
||||
)
|
||||
await blob_client.delete_blob()
|
||||
|
||||
@ -127,6 +127,7 @@ class DocumentService:
|
||||
container_name=self.__get_container_name__(),
|
||||
blob_name=self.__get_blob_name__(),
|
||||
)
|
||||
await self.__document_doc.delete()
|
||||
|
||||
async def read_document_file_as_http_media_data(self) -> str:
|
||||
self.__validate_document_doc()
|
||||
|
||||
@ -2,10 +2,12 @@ from fastapi import APIRouter
|
||||
from .retrieve_document_info import router as ri_router
|
||||
from .upload_file import router as ud_router
|
||||
from .read_document_as_http_media import router as rd_router
|
||||
from .delete_file import router as df_router
|
||||
|
||||
api_router = APIRouter(prefix="/central_storage")
|
||||
api_router.include_router(ri_router, tags=["document"])
|
||||
api_router.include_router(ud_router, tags=["document"])
|
||||
api_router.include_router(rd_router, tags=["document"])
|
||||
api_router.include_router(df_router, tags=["document"])
|
||||
|
||||
websocket_router = APIRouter()
|
||||
|
||||
42
apps/central_storage/webapi/routes/delete_file.py
Normal file
42
apps/central_storage/webapi/routes/delete_file.py
Normal file
@ -0,0 +1,42 @@
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel
|
||||
from backend.application.document_hub import DocumentHub
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class DeleteFileRequest(BaseModel):
|
||||
document_ids: list[str]
|
||||
|
||||
|
||||
async def background_delete_documents(document_ids: list[str]):
|
||||
"""
|
||||
Background task to delete documents from Azure Blob Storage.
|
||||
"""
|
||||
document_hub = DocumentHub()
|
||||
try:
|
||||
await document_hub.delete_documents(document_ids)
|
||||
print(f"Successfully deleted documents: {document_ids}")
|
||||
except Exception as e:
|
||||
print(f"Error occurred during document deletion: {e}")
|
||||
|
||||
|
||||
@router.delete(
|
||||
"/delete-file",
|
||||
summary="Queue file deletion from blob storage",
|
||||
description="Queue deletion of files from Azure Blob Storage given a list of document IDs",
|
||||
)
|
||||
async def delete_file(request: DeleteFileRequest, background_tasks: BackgroundTasks):
|
||||
"""
|
||||
Queue the deletion process and respond immediately.
|
||||
"""
|
||||
try:
|
||||
# Queue the background task for deletion
|
||||
background_tasks.add_task(background_delete_documents, request.document_ids)
|
||||
return JSONResponse(
|
||||
content={"message": "File deletion has been queued for processing"},
|
||||
status_code=202,
|
||||
)
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
56
apps/content/backend/content/document_cleaner.py
Normal file
56
apps/content/backend/content/document_cleaner.py
Normal file
@ -0,0 +1,56 @@
|
||||
from typing import Set
|
||||
from backend.content.models import DocumentDoc
|
||||
from backend.content.models import ContentFolderDoc
|
||||
from backend.document.document_manager import DocumentManager
|
||||
|
||||
|
||||
class DocumentCleaner:
|
||||
@staticmethod
|
||||
async def clean_up_unused_documents():
|
||||
"""
|
||||
Remove unused DocumentDoc records after all content_directories are updated.
|
||||
"""
|
||||
|
||||
# Collect all valid document IDs from all content folders
|
||||
valid_document_ids: Set[str] = set()
|
||||
async for folder in ContentFolderDoc.find():
|
||||
for directory in folder.content_directories:
|
||||
valid_document_ids.update(
|
||||
filter(
|
||||
None,
|
||||
[
|
||||
directory.cover_document_id,
|
||||
directory.summary_text_new,
|
||||
directory.title_text_new,
|
||||
directory.content_html_new,
|
||||
directory.content_text_new,
|
||||
directory.content_document_id,
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
print(
|
||||
f"Valid document IDs (from updated content directories): {valid_document_ids}"
|
||||
)
|
||||
|
||||
# Retrieve all document IDs from DocumentDoc
|
||||
all_document_ids = set()
|
||||
document_locations = {}
|
||||
async for doc in DocumentDoc.find(
|
||||
{"created_by": {"$regex": "^content-service-"}}
|
||||
): # Fetch all documents that are created by content-service
|
||||
if doc.document_id: # Ensure document_id is not None
|
||||
all_document_ids.add(doc.document_id)
|
||||
document_locations[doc.document_id] = doc.location
|
||||
print(f"All document IDs (from DocumentDoc): {all_document_ids}")
|
||||
|
||||
# Find unused document IDs
|
||||
unused_document_ids = all_document_ids - valid_document_ids
|
||||
print(f"Unused document IDs: {unused_document_ids}")
|
||||
|
||||
# Remove unused DocumentDoc records
|
||||
if unused_document_ids:
|
||||
document_manager = DocumentManager()
|
||||
await document_manager.delete_documents(unused_document_ids)
|
||||
else:
|
||||
print("No unused DocumentDocs to clean up.")
|
||||
@ -1,4 +1,5 @@
|
||||
from backend.content.content_sharepoint_manager import ContentSharePointManager
|
||||
from backend.content.document_cleaner import DocumentCleaner
|
||||
from common.constants.region import UserRegion
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from common.log.log_utils import log_entry_exit_async
|
||||
@ -10,10 +11,11 @@ from backend.content.constants import ContentSource
|
||||
class SharePointContentRefresher:
|
||||
def __init__(self) -> None:
|
||||
self.content_sharepoint_manager = ContentSharePointManager()
|
||||
self.document_cleaner = DocumentCleaner()
|
||||
|
||||
@log_entry_exit_async
|
||||
async def refresh_all_in_database(self) -> None:
|
||||
await self.content_sharepoint_manager.retrieve_directorys_for_all_folders()
|
||||
await self.content_sharepoint_manager.retrieve_directories_for_all_folders()
|
||||
|
||||
@log_entry_exit_async
|
||||
async def refresh_selected(self) -> None:
|
||||
@ -37,3 +39,7 @@ class SharePointContentRefresher:
|
||||
await self.content_sharepoint_manager.retrieve_directories_for_folder(
|
||||
folder_name=folder["folder"], region=folder["region"]
|
||||
)
|
||||
|
||||
# Clean up unused DocumentDocs
|
||||
# This actually can be run separately and had idempotency guarantee
|
||||
await self.document_cleaner.clean_up_unused_documents()
|
||||
|
||||
@ -36,6 +36,20 @@ class DocumentManager:
|
||||
)
|
||||
return response.json()["document_id"]
|
||||
|
||||
async def delete_documents(self, document_ids: list):
|
||||
api_url = self.storage_service_api_base + "delete-file"
|
||||
document_ids_list = list(document_ids)
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.request(
|
||||
method="DELETE",
|
||||
url=api_url,
|
||||
json={"document_ids": document_ids_list},
|
||||
)
|
||||
if response.status_code == 202:
|
||||
print("Deletion request has been queued successfully.")
|
||||
else:
|
||||
print(f"Failed to queue deletion: {response.text}")
|
||||
|
||||
async def cleanup_document(self):
|
||||
# Corrected query with regex
|
||||
documents = await DocumentDoc.find(
|
||||
|
||||
Loading…
Reference in New Issue
Block a user