from datetime import datetime, timedelta, timezone
from typing import Set

from backend.content.models import DocumentDoc
from backend.content.models import ContentFolderDoc
from backend.document.document_manager import DocumentManager


class DocumentCleaner:
    """Housekeeping for DocumentDoc records no content directory references."""

    @staticmethod
    async def clean_up_unused_documents():
        """
        Remove unused DocumentDoc records after all content_directories are updated.

        The method:

        1. Collects every non-empty reference value stored on the content
           directories of every ContentFolderDoc.
        2. Collects the ``document_id`` of every DocumentDoc whose
           ``created_by`` starts with ``content-service-`` and whose
           ``create_time`` is more than one day in the past.
        3. Passes the IDs found in step 2 but not in step 1 to
           ``DocumentManager.delete_documents``.

        Progress is reported with ``print``. Nothing is returned.
        """
        # Collect all valid document IDs from all content folders.
        # NOTE(review): the *_new fields are treated as document IDs here;
        # confirm against the content directory model.
        valid_document_ids: Set[str] = set()
        async for folder in ContentFolderDoc.find():
            for directory in folder.content_directories:
                referenced = (
                    directory.cover_document_id,
                    directory.summary_text_new,
                    directory.title_text_new,
                    directory.content_html_new,
                    directory.content_text_new,
                    directory.content_document_id,
                )
                # filter(None, ...) drops unset (None / empty) references.
                valid_document_ids.update(filter(None, referenced))

        print(
            f"Valid document IDs (from updated content directories): {valid_document_ids}"
        )

        # Only documents older than one day are candidates for deletion.
        # `datetime` here is the class (from `from datetime import datetime`),
        # which has no `UTC` attribute; `timezone.utc` is the portable way to
        # obtain an aware UTC timestamp.
        time_1_day_ago = datetime.now(timezone.utc) - timedelta(days=1)

        # Retrieve the IDs of all documents created by content-service.
        all_document_ids: Set[str] = set()
        candidate_query = {
            "created_by": {"$regex": "^content-service-"},
            "create_time": {"$lt": time_1_day_ago},
        }
        async for doc in DocumentDoc.find(candidate_query):
            if doc.document_id:  # Ensure document_id is not None
                all_document_ids.add(doc.document_id)

        print(f"All document IDs (from DocumentDoc): {all_document_ids}")

        # Find unused document IDs
        unused_document_ids = all_document_ids - valid_document_ids
        print(f"Unused document IDs: {unused_document_ids}")

        # Remove unused DocumentDoc records
        if not unused_document_ids:
            print("No unused DocumentDocs to clean up.")
            return

        document_manager = DocumentManager()
        await document_manager.delete_documents(unused_document_ids)