Add delete-file endpoint, add remove old documents logic in refresh sharepoint logic
This commit is contained in:
parent
9aa8ee3afd
commit
81f734c19a
@ -4,7 +4,9 @@ from backend.business.document_manager import (
|
|||||||
|
|
||||||
|
|
||||||
class DocumentHub:
|
class DocumentHub:
|
||||||
def __init__(self, ):
|
def __init__(
|
||||||
|
self,
|
||||||
|
):
|
||||||
self.document_manager = DocumentManager()
|
self.document_manager = DocumentManager()
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -12,7 +14,9 @@ class DocumentHub:
|
|||||||
return await self.document_manager.retrieve_document_info(document_id)
|
return await self.document_manager.retrieve_document_info(document_id)
|
||||||
|
|
||||||
async def read_document_file_as_http_media_data(self, document_id: str):
|
async def read_document_file_as_http_media_data(self, document_id: str):
|
||||||
return await self.document_manager.read_document_file_as_http_media_data(document_id)
|
return await self.document_manager.read_document_file_as_http_media_data(
|
||||||
|
document_id
|
||||||
|
)
|
||||||
|
|
||||||
async def upload_document(
|
async def upload_document(
|
||||||
self, associated_with: str, file_name: str, file_data: bytes
|
self, associated_with: str, file_name: str, file_data: bytes
|
||||||
@ -25,3 +29,8 @@ class DocumentHub:
|
|||||||
return await self.document_manager.upload_file(
|
return await self.document_manager.upload_file(
|
||||||
associated_with, file_name, file_data
|
associated_with, file_name, file_data
|
||||||
)
|
)
|
||||||
|
|
||||||
|
async def delete_documents(self, document_ids: list):
|
||||||
|
for document_id in document_ids:
|
||||||
|
await self.document_manager.delete_document(document_id)
|
||||||
|
return
|
||||||
|
|||||||
@ -23,7 +23,6 @@ class DocumentManager:
|
|||||||
|
|
||||||
return await self.document_service.read_document_file_as_http_media_data()
|
return await self.document_service.read_document_file_as_http_media_data()
|
||||||
|
|
||||||
|
|
||||||
async def upload_file(
|
async def upload_file(
|
||||||
self, associated_with: str, file_name: str, file_data: bytes
|
self, associated_with: str, file_name: str, file_data: bytes
|
||||||
) -> bool:
|
) -> bool:
|
||||||
@ -40,3 +39,8 @@ class DocumentManager:
|
|||||||
# request_id, document_id
|
# request_id, document_id
|
||||||
# )
|
# )
|
||||||
# await self.proposal_store.update_latest_proposal_with_documents(request_id)
|
# await self.proposal_store.update_latest_proposal_with_documents(request_id)
|
||||||
|
|
||||||
|
async def delete_document(self, document_id: str):
|
||||||
|
await self.document_service.load_document(document_id=document_id)
|
||||||
|
await self.document_service.remove_document_file()
|
||||||
|
return
|
||||||
|
|||||||
@ -77,7 +77,7 @@ class AzureBlobHandler:
|
|||||||
app_settings.AZURE_STORAGE_DOCUMENT_API_ENDPOINT,
|
app_settings.AZURE_STORAGE_DOCUMENT_API_ENDPOINT,
|
||||||
credential=app_settings.AZURE_STORAGE_DOCUMENT_API_KEY,
|
credential=app_settings.AZURE_STORAGE_DOCUMENT_API_KEY,
|
||||||
) as blob_service_client:
|
) as blob_service_client:
|
||||||
blob_client = await blob_service_client.get_blob_client(
|
blob_client = blob_service_client.get_blob_client(
|
||||||
container=container_name, blob=blob_name
|
container=container_name, blob=blob_name
|
||||||
)
|
)
|
||||||
await blob_client.delete_blob()
|
await blob_client.delete_blob()
|
||||||
|
|||||||
@ -127,6 +127,7 @@ class DocumentService:
|
|||||||
container_name=self.__get_container_name__(),
|
container_name=self.__get_container_name__(),
|
||||||
blob_name=self.__get_blob_name__(),
|
blob_name=self.__get_blob_name__(),
|
||||||
)
|
)
|
||||||
|
await self.__document_doc.delete()
|
||||||
|
|
||||||
async def read_document_file_as_http_media_data(self) -> str:
|
async def read_document_file_as_http_media_data(self) -> str:
|
||||||
self.__validate_document_doc()
|
self.__validate_document_doc()
|
||||||
|
|||||||
@ -2,10 +2,12 @@ from fastapi import APIRouter
|
|||||||
from .retrieve_document_info import router as ri_router
|
from .retrieve_document_info import router as ri_router
|
||||||
from .upload_file import router as ud_router
|
from .upload_file import router as ud_router
|
||||||
from .read_document_as_http_media import router as rd_router
|
from .read_document_as_http_media import router as rd_router
|
||||||
|
from .delete_file import router as df_router
|
||||||
|
|
||||||
api_router = APIRouter(prefix="/central_storage")
|
api_router = APIRouter(prefix="/central_storage")
|
||||||
api_router.include_router(ri_router, tags=["document"])
|
api_router.include_router(ri_router, tags=["document"])
|
||||||
api_router.include_router(ud_router, tags=["document"])
|
api_router.include_router(ud_router, tags=["document"])
|
||||||
api_router.include_router(rd_router, tags=["document"])
|
api_router.include_router(rd_router, tags=["document"])
|
||||||
|
api_router.include_router(df_router, tags=["document"])
|
||||||
|
|
||||||
websocket_router = APIRouter()
|
websocket_router = APIRouter()
|
||||||
|
|||||||
42
apps/central_storage/webapi/routes/delete_file.py
Normal file
42
apps/central_storage/webapi/routes/delete_file.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from backend.application.document_hub import DocumentHub
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
class DeleteFileRequest(BaseModel):
|
||||||
|
document_ids: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
async def background_delete_documents(document_ids: list[str]):
|
||||||
|
"""
|
||||||
|
Background task to delete documents from Azure Blob Storage.
|
||||||
|
"""
|
||||||
|
document_hub = DocumentHub()
|
||||||
|
try:
|
||||||
|
await document_hub.delete_documents(document_ids)
|
||||||
|
print(f"Successfully deleted documents: {document_ids}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error occurred during document deletion: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete(
|
||||||
|
"/delete-file",
|
||||||
|
summary="Queue file deletion from blob storage",
|
||||||
|
description="Queue deletion of files from Azure Blob Storage given a list of document IDs",
|
||||||
|
)
|
||||||
|
async def delete_file(request: DeleteFileRequest, background_tasks: BackgroundTasks):
|
||||||
|
"""
|
||||||
|
Queue the deletion process and respond immediately.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Queue the background task for deletion
|
||||||
|
background_tasks.add_task(background_delete_documents, request.document_ids)
|
||||||
|
return JSONResponse(
|
||||||
|
content={"message": "File deletion has been queued for processing"},
|
||||||
|
status_code=202,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
56
apps/content/backend/content/document_cleaner.py
Normal file
56
apps/content/backend/content/document_cleaner.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
from typing import Set
|
||||||
|
from backend.content.models import DocumentDoc
|
||||||
|
from backend.content.models import ContentFolderDoc
|
||||||
|
from backend.document.document_manager import DocumentManager
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentCleaner:
|
||||||
|
@staticmethod
|
||||||
|
async def clean_up_unused_documents():
|
||||||
|
"""
|
||||||
|
Remove unused DocumentDoc records after all content_directories are updated.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Collect all valid document IDs from all content folders
|
||||||
|
valid_document_ids: Set[str] = set()
|
||||||
|
async for folder in ContentFolderDoc.find():
|
||||||
|
for directory in folder.content_directories:
|
||||||
|
valid_document_ids.update(
|
||||||
|
filter(
|
||||||
|
None,
|
||||||
|
[
|
||||||
|
directory.cover_document_id,
|
||||||
|
directory.summary_text_new,
|
||||||
|
directory.title_text_new,
|
||||||
|
directory.content_html_new,
|
||||||
|
directory.content_text_new,
|
||||||
|
directory.content_document_id,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"Valid document IDs (from updated content directories): {valid_document_ids}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Retrieve all document IDs from DocumentDoc
|
||||||
|
all_document_ids = set()
|
||||||
|
document_locations = {}
|
||||||
|
async for doc in DocumentDoc.find(
|
||||||
|
{"created_by": {"$regex": "^content-service-"}}
|
||||||
|
): # Fetch all documents that are created by content-service
|
||||||
|
if doc.document_id: # Ensure document_id is not None
|
||||||
|
all_document_ids.add(doc.document_id)
|
||||||
|
document_locations[doc.document_id] = doc.location
|
||||||
|
print(f"All document IDs (from DocumentDoc): {all_document_ids}")
|
||||||
|
|
||||||
|
# Find unused document IDs
|
||||||
|
unused_document_ids = all_document_ids - valid_document_ids
|
||||||
|
print(f"Unused document IDs: {unused_document_ids}")
|
||||||
|
|
||||||
|
# Remove unused DocumentDoc records
|
||||||
|
if unused_document_ids:
|
||||||
|
document_manager = DocumentManager()
|
||||||
|
await document_manager.delete_documents(unused_document_ids)
|
||||||
|
else:
|
||||||
|
print("No unused DocumentDocs to clean up.")
|
||||||
@ -1,4 +1,5 @@
|
|||||||
from backend.content.content_sharepoint_manager import ContentSharePointManager
|
from backend.content.content_sharepoint_manager import ContentSharePointManager
|
||||||
|
from backend.content.document_cleaner import DocumentCleaner
|
||||||
from common.constants.region import UserRegion
|
from common.constants.region import UserRegion
|
||||||
from datetime import datetime, timezone, timedelta
|
from datetime import datetime, timezone, timedelta
|
||||||
from common.log.log_utils import log_entry_exit_async
|
from common.log.log_utils import log_entry_exit_async
|
||||||
@ -10,10 +11,11 @@ from backend.content.constants import ContentSource
|
|||||||
class SharePointContentRefresher:
|
class SharePointContentRefresher:
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.content_sharepoint_manager = ContentSharePointManager()
|
self.content_sharepoint_manager = ContentSharePointManager()
|
||||||
|
self.document_cleaner = DocumentCleaner()
|
||||||
|
|
||||||
@log_entry_exit_async
|
@log_entry_exit_async
|
||||||
async def refresh_all_in_database(self) -> None:
|
async def refresh_all_in_database(self) -> None:
|
||||||
await self.content_sharepoint_manager.retrieve_directorys_for_all_folders()
|
await self.content_sharepoint_manager.retrieve_directories_for_all_folders()
|
||||||
|
|
||||||
@log_entry_exit_async
|
@log_entry_exit_async
|
||||||
async def refresh_selected(self) -> None:
|
async def refresh_selected(self) -> None:
|
||||||
@ -37,3 +39,7 @@ class SharePointContentRefresher:
|
|||||||
await self.content_sharepoint_manager.retrieve_directories_for_folder(
|
await self.content_sharepoint_manager.retrieve_directories_for_folder(
|
||||||
folder_name=folder["folder"], region=folder["region"]
|
folder_name=folder["folder"], region=folder["region"]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Clean up unused DocumentDocs
|
||||||
|
# This actually can be run separately and had idempotency guarantee
|
||||||
|
await self.document_cleaner.clean_up_unused_documents()
|
||||||
|
|||||||
@ -36,6 +36,20 @@ class DocumentManager:
|
|||||||
)
|
)
|
||||||
return response.json()["document_id"]
|
return response.json()["document_id"]
|
||||||
|
|
||||||
|
async def delete_documents(self, document_ids: list):
|
||||||
|
api_url = self.storage_service_api_base + "delete-file"
|
||||||
|
document_ids_list = list(document_ids)
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
response = await client.request(
|
||||||
|
method="DELETE",
|
||||||
|
url=api_url,
|
||||||
|
json={"document_ids": document_ids_list},
|
||||||
|
)
|
||||||
|
if response.status_code == 202:
|
||||||
|
print("Deletion request has been queued successfully.")
|
||||||
|
else:
|
||||||
|
print(f"Failed to queue deletion: {response.text}")
|
||||||
|
|
||||||
async def cleanup_document(self):
|
async def cleanup_document(self):
|
||||||
# Corrected query with regex
|
# Corrected query with regex
|
||||||
documents = await DocumentDoc.find(
|
documents = await DocumentDoc.find(
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user