Distill certain relevant trains of thought during execution

2026-05-13 21:41:41 +00:00 · 2025-01-24 22:17:57 -08:00
parent a3b5ec4737
commit 01ea7c382e
1 changed file with 70 additions and 0 deletions
--- a/src/khoj/processor/conversation/utils.py
+++ b/src/khoj/processor/conversation/utils.py
@@ -1,4 +1,5 @@
 import base64
+import csv
 import json
 import logging
 import math
@@ -11,6 +12,7 @@ from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
 from io import BytesIO
+from pathlib import Path
 from time import perf_counter
 from typing import Any, Callable, Dict, List, Optional

@@ -293,6 +295,74 @@ def save_to_conversation_log(
    if generated_mermaidjs_diagram:
        khoj_message_metadata["mermaidjsDiagram"] = generated_mermaidjs_diagram

+    # Get log file path in same directory as script
+    log_file = os.path.join(os.path.dirname(__file__), "research_execution.csv")
+
+    # Open in append+read mode ("a+") so the file can be read back to check whether it is empty
+    with open(log_file, "a+", newline="") as f:
+        writer = csv.writer(f)
+
+        # Write headers if file is empty
+        f.seek(0)
+        if not f.read():
+            writer.writerow(["timestamp", "query", "context", "response"])
+
+        EXCLUDED_PHRASES = [
+            "**Generating a well-informed response**",
+            "**Searching the Internet for**",
+            "**Running code snippet**",
+        ]
+
+        trains_of_thought = ""
+        for t in train_of_thought:
+            tot_contains_excluded_phrase = any(phrase in t["data"] for phrase in EXCLUDED_PHRASES)
+            if t["type"] == "status" and not tot_contains_excluded_phrase:
+                trains_of_thought += t["data"] + "\n\n"
+
+        # Write new row
+        writer.writerow(
+            [
+                datetime.now().isoformat(),
+                q,
+                trains_of_thought,
+                chat_response,
+            ]
+        )
+
+    # Formatted data
+    formatted_data_file = Path(__file__).parent / "research_execution.json"
+
+    # Create conversation object
+    new_conversation = {
+        "conversation": [
+            {"from": "user", "value": q},
+            {
+                "from": "assistant",
+                "value": f"<begin_of_thought>\n\n{trains_of_thought}<end_of_thought>\n\n<begin_of_solution>\n\n{chat_response}<end_of_solution>",
+            },
+        ]
+    }
+
+    # Load existing data or create new list
+    if formatted_data_file.exists():
+        with open(formatted_data_file) as f:
+            try:
+                conversations = json.load(f)
+            except json.JSONDecodeError:
+                conversations = []
+    else:
+        conversations = []
+
+    # Append new conversation
+    conversations.append(new_conversation)
+
+    # Write back atomically
+    temp_file = formatted_data_file.with_suffix(".tmp")
+    with open(temp_file, "w") as f:
+        json.dump(conversations, f, indent=2)
+
+    temp_file.replace(formatted_data_file)
+
    updated_conversation = message_to_log(
        user_message=q,
        chat_response=chat_response,