Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
281 changes: 268 additions & 13 deletions lightrag/api/routers/document_routes.py

Large diffs are not rendered by default.

182 changes: 163 additions & 19 deletions lightrag/api/routers/query_routes.py

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions lightrag/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,12 @@ class QueryParam:
Default is True to enable reranking when rerank model is available.
"""

include_metadata: bool = False
"""If True, retrieves document metadata for each chunk using the full_doc_id.
Metadata is looked up on-demand from document storage. This allows queries to
include document-level metadata without storing it with every chunk.
"""

include_references: bool = False
"""If True, includes reference list in the response for supported endpoints.
This parameter controls whether the API response includes a references field
Expand Down
167 changes: 133 additions & 34 deletions lightrag/lightrag.py

Large diffs are not rendered by default.

51 changes: 46 additions & 5 deletions lightrag/operate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3092,6 +3092,7 @@ async def kg_query(
hashing_kv: BaseKVStorage | None = None,
system_prompt: str | None = None,
chunks_vdb: BaseVectorStorage = None,
doc_status_storage: BaseKVStorage | None = None,
) -> QueryResult | None:
"""
Execute knowledge graph query and return unified QueryResult object.
Expand All @@ -3107,6 +3108,7 @@ async def kg_query(
hashing_kv: Cache storage
system_prompt: System prompt
chunks_vdb: Document chunks vector database
doc_status_storage: Document status storage for metadata retrieval (optional)

Returns:
QueryResult | None: Unified query result object containing:
Expand Down Expand Up @@ -3166,6 +3168,7 @@ async def kg_query(
text_chunks_db,
query_param,
chunks_vdb,
doc_status_storage,
)

if context_result is None:
Expand Down Expand Up @@ -3475,6 +3478,7 @@ async def _get_vector_context(
"content": result["content"],
"created_at": result.get("created_at", None),
"file_path": result.get("file_path", "unknown_source"),
"full_doc_id": result.get("full_doc_id"),
"source_type": "vector", # Mark the source type
"chunk_id": result.get("id"), # Add chunk_id for deduplication
}
Expand Down Expand Up @@ -3983,21 +3987,36 @@ async def _build_context_str(
chunk_tracking: dict = None,
entity_id_to_original: dict = None,
relation_id_to_original: dict = None,
doc_status_storage: BaseKVStorage | None = None,
) -> tuple[str, dict[str, Any]]:
"""
Build the final LLM context string with token processing.
This includes dynamic token calculation and final chunk truncation.

Args:
entities_context: List of entity context dicts
relations_context: List of relation context dicts
merged_chunks: List of merged chunk dicts
query: Query string
query_param: Query parameters (includes include_metadata flag)
global_config: Global configuration
chunk_tracking: Chunk tracking information
entity_id_to_original: Mapping from entity IDs to original data
relation_id_to_original: Mapping from relation IDs to original data
doc_status_storage: Document status storage for metadata lookup
"""
tokenizer = global_config.get("tokenizer")
if not tokenizer:
logger.error("Missing tokenizer, cannot build LLM context")
# Return empty raw data structure when no tokenizer
empty_raw_data = convert_to_user_format(
empty_raw_data = await convert_to_user_format(
[],
[],
[],
[],
query_param.mode,
doc_status_storage=doc_status_storage,
include_metadata=query_param.include_metadata,
)
empty_raw_data["status"] = "failure"
empty_raw_data["message"] = "Missing tokenizer, cannot build LLM context."
Expand Down Expand Up @@ -4100,12 +4119,14 @@ async def _build_context_str(
# not necessary to use LLM to generate a response
if not entities_context and not relations_context and not chunks_context:
# Return empty raw data structure when no entities/relations
empty_raw_data = convert_to_user_format(
empty_raw_data = await convert_to_user_format(
[],
[],
[],
[],
query_param.mode,
doc_status_storage=doc_status_storage,
include_metadata=query_param.include_metadata,
)
empty_raw_data["status"] = "failure"
empty_raw_data["message"] = "Query returned empty dataset."
Expand Down Expand Up @@ -4140,14 +4161,16 @@ async def _build_context_str(
logger.debug(
f"[_build_context_str] Converting to user format: {len(entities_context)} entities, {len(relations_context)} relations, {len(truncated_chunks)} chunks"
)
final_data = convert_to_user_format(
final_data = await convert_to_user_format(
entities_context,
relations_context,
truncated_chunks,
reference_list,
query_param.mode,
entity_id_to_original,
relation_id_to_original,
doc_status_storage=doc_status_storage,
include_metadata=query_param.include_metadata,
)
logger.debug(
f"[_build_context_str] Final data after conversion: {len(final_data.get('entities', []))} entities, {len(final_data.get('relationships', []))} relationships, {len(final_data.get('chunks', []))} chunks"
Expand All @@ -4166,12 +4189,25 @@ async def _build_query_context(
text_chunks_db: BaseKVStorage,
query_param: QueryParam,
chunks_vdb: BaseVectorStorage = None,
doc_status_storage: BaseKVStorage | None = None,
) -> QueryContextResult | None:
"""
Main query context building function using the new 4-stage architecture:
1. Search -> 2. Truncate -> 3. Merge chunks -> 4. Build LLM context

Returns unified QueryContextResult containing both context and raw_data.

Args:
query: Query string
ll_keywords: Low-level keywords
hl_keywords: High-level keywords
knowledge_graph_inst: Knowledge graph storage
entities_vdb: Entity vector database
relationships_vdb: Relationship vector database
text_chunks_db: Text chunks storage
query_param: Query parameters (includes include_metadata flag)
chunks_vdb: Document chunks vector database
doc_status_storage: Document status storage for metadata lookup
"""

if not query:
Expand Down Expand Up @@ -4238,6 +4274,7 @@ async def _build_query_context(
chunk_tracking=search_result["chunk_tracking"],
entity_id_to_original=truncation_result["entity_id_to_original"],
relation_id_to_original=truncation_result["relation_id_to_original"],
doc_status_storage=doc_status_storage,
)

# Convert keywords strings to lists and add complete metadata to raw_data
Expand Down Expand Up @@ -4877,17 +4914,19 @@ async def naive_query(
global_config: dict[str, str],
hashing_kv: BaseKVStorage | None = None,
system_prompt: str | None = None,
doc_status_storage: BaseKVStorage | None = None,
) -> QueryResult | None:
"""
Execute naive query and return unified QueryResult object.

Args:
query: Query string
chunks_vdb: Document chunks vector database
query_param: Query parameters
query_param: Query parameters (includes include_metadata flag)
global_config: Global configuration
hashing_kv: Cache storage
system_prompt: System prompt
doc_status_storage: Document status storage for metadata lookup (optional)

Returns:
QueryResult | None: Unified query result object containing:
Expand Down Expand Up @@ -4979,12 +5018,14 @@ async def naive_query(
logger.info(f"Final context: {len(processed_chunks_with_ref_ids)} chunks")

# Build raw data structure for naive mode using processed chunks with reference IDs
raw_data = convert_to_user_format(
raw_data = await convert_to_user_format(
[], # naive mode has no entities
[], # naive mode has no relationships
processed_chunks_with_ref_ids,
reference_list,
"naive",
doc_status_storage=doc_status_storage,
include_metadata=query_param.include_metadata,
)

# Add complete metadata for naive mode
Expand Down
52 changes: 50 additions & 2 deletions lightrag/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3114,16 +3114,33 @@ def create_prefixed_exception(original_exception: Exception, prefix: str) -> Exc
)


def convert_to_user_format(
async def convert_to_user_format(
entities_context: list[dict],
relations_context: list[dict],
chunks: list[dict],
references: list[dict],
query_mode: str,
entity_id_to_original: dict = None,
relation_id_to_original: dict = None,
doc_status_storage=None,
include_metadata: bool = False,
) -> dict[str, Any]:
"""Convert internal data format to user-friendly format using original database data"""
"""Convert internal data format to user-friendly format using original database data

Args:
entities_context: List of entity context dicts
relations_context: List of relation context dicts
chunks: List of chunk dicts (with full_doc_id)
references: List of reference dicts
query_mode: Query mode string
entity_id_to_original: Mapping of entity IDs to original data
relation_id_to_original: Mapping of relation IDs to original data
doc_status_storage: Document status storage for metadata lookup (optional)
include_metadata: If True, retrieves metadata for each chunk using full_doc_id

Returns:
dict containing formatted data with optional metadata
"""

# Convert entities format using original data when available
formatted_entities = []
Expand Down Expand Up @@ -3201,6 +3218,28 @@ def convert_to_user_format(
}
)

# Fetch metadata if requested and doc_status_storage is provided
doc_id_to_metadata = {}
if include_metadata and doc_status_storage is not None:
# Collect unique full_doc_ids from chunks
unique_doc_ids = set()
for chunk in chunks:
full_doc_id = chunk.get("full_doc_id")
if full_doc_id:
unique_doc_ids.add(full_doc_id)

# Batch lookup metadata for all unique document IDs
if unique_doc_ids:
for doc_id in unique_doc_ids:
try:
doc_data = await doc_status_storage.get_by_id(doc_id)
if doc_data and "metadata" in doc_data:
doc_id_to_metadata[doc_id] = doc_data.get("metadata")
except Exception as e:
logger.warning(
f"[convert_to_user_format] Failed to fetch metadata for doc_id {doc_id}: {e}"
)

# Convert chunks format (chunks already contain complete data)
formatted_chunks = []
for i, chunk in enumerate(chunks):
Expand All @@ -3210,6 +3249,15 @@ def convert_to_user_format(
"file_path": chunk.get("file_path", "unknown_source"),
"chunk_id": chunk.get("chunk_id", ""),
}

# Add metadata if requested and available
if include_metadata:
full_doc_id = chunk.get("full_doc_id")
if full_doc_id and full_doc_id in doc_id_to_metadata:
chunk_data["metadata"] = doc_id_to_metadata[full_doc_id]
else:
chunk_data["metadata"] = None

formatted_chunks.append(chunk_data)

logger.debug(
Expand Down
15 changes: 15 additions & 0 deletions lightrag_webui/src/api/lightrag.ts
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,21 @@ export const deleteDocuments = async (
return response.data
}

/**
 * Fetch a single document record by its ID.
 *
 * Issues a GET request to `/documents/{documentId}` and returns the response
 * body. The ID is percent-encoded before being placed in the path so that
 * reserved characters (`/`, `?`, `#`, `%`, ...) cannot change the request
 * target.
 *
 * Request failures are not caught here; they propagate to the caller.
 *
 * @param documentId - Identifier of the document to fetch.
 * @returns The response payload, typed as `DocStatusResponse`.
 */
export const getDocumentById = async (documentId: string): Promise<DocStatusResponse> => {
  const response = await axiosInstance.get(`/documents/${encodeURIComponent(documentId)}`)
  return response.data
}

/**
 * Update the metadata attached to a document.
 *
 * Issues a PATCH request to `/documents/{documentId}/metadata` with a JSON
 * body of the form `{ metadata }` and returns the response body. The ID is
 * percent-encoded before being placed in the path so that reserved characters
 * (`/`, `?`, `#`, `%`, ...) cannot change the request target.
 *
 * Request failures are not caught here; they propagate to the caller.
 *
 * @param documentId - Identifier of the document whose metadata is updated.
 * @param metadata - Metadata object sent under the `metadata` key of the body.
 * @returns The response payload, typed as `DocActionResponse`.
 */
export const updateDocumentMetadata = async (
  documentId: string,
  metadata: Record<string, any>
): Promise<DocActionResponse> => {
  const response = await axiosInstance.patch(
    `/documents/${encodeURIComponent(documentId)}/metadata`,
    { metadata }
  )
  return response.data
}

export const getAuthStatus = async (): Promise<AuthStatusResponse> => {
try {
// Add a timeout to the request to prevent hanging
Expand Down
Loading
Loading