55 lines
2.1 KiB
Python
55 lines
2.1 KiB
Python
from typing import Set
|
|
from backend.content.models import DocumentDoc
|
|
from backend.content.models import ContentFolderDoc
|
|
from backend.document.document_manager import DocumentManager
|
|
|
|
|
|
class DocumentCleaner:
    """Housekeeping for DocumentDoc records no content directory refers to."""

    @staticmethod
    async def clean_up_unused_documents():
        """
        Delete content-service DocumentDoc records that nothing references.

        Steps:
          1. Walk every ContentFolderDoc and gather the non-empty values of
             the document-related fields on each of its content directories.
          2. Gather the ``document_id`` of every DocumentDoc whose
             ``created_by`` matches ``^content-service-``.
          3. Hand the IDs found in step 2 but not in step 1 to
             ``DocumentManager.delete_documents``.

        Intended to run once all content_directories have been updated.
        Progress is reported with ``print``; nothing is returned.
        """
        # Step 1: IDs still referenced by some content directory.
        # NOTE(review): the *_new fields are treated as document IDs here,
        # exactly like the *_document_id fields -- confirm against the model.
        referenced_ids: Set[str] = set()
        async for content_folder in ContentFolderDoc.find():
            for entry in content_folder.content_directories:
                candidates = (
                    entry.cover_document_id,
                    entry.summary_text_new,
                    entry.title_text_new,
                    entry.content_html_new,
                    entry.content_text_new,
                    entry.content_document_id,
                )
                # Skip unset/empty fields; only truthy values are kept.
                referenced_ids.update(value for value in candidates if value)

        print(
            f"Valid document IDs (from updated content directories): {referenced_ids}"
        )

        # Step 2: IDs of the documents created by content-service.
        created_by_content_service = {"created_by": {"$regex": "^content-service-"}}
        stored_ids = {
            record.document_id
            async for record in DocumentDoc.find(created_by_content_service)
            if record.document_id  # ignore records without a document_id
        }
        print(f"All document IDs (from DocumentDoc): {stored_ids}")

        # Step 3: whatever is stored but no longer referenced is orphaned.
        orphaned_ids = stored_ids.difference(referenced_ids)
        print(f"Unused document IDs: {orphaned_ids}")

        if not orphaned_ids:
            print("No unused DocumentDocs to clean up.")
            return

        await DocumentManager().delete_documents(orphaned_ids)
|