Compare commits

...

14 Commits

Author SHA1 Message Date
Saba
21a9fbcea3 Update panchayat yaml to jsonl file to compile additional attributes
- Update typing for list to use the List object from typing module
- Parse number of upvotes, created date
- Add support for word filter and date filter on compiled entries
2022-12-28 09:50:44 -03:00
Saba
21eb58156c Resolve merge conflict in build.yml 2022-09-15 20:35:02 +03:00
Saba
63f2312b84 Update panchayat-to-jsonl changes to merge with master 2022-09-15 20:08:54 +03:00
Saba
f12ca56e93 Merge intermediate changes 2022-09-14 21:09:30 +03:00
Saba
ea62d47aa5 Address merge conflicts from master branch 2022-09-14 14:23:17 +03:00
Saba
240901e07f Add workflow dispatch support in build.yml
- To support dispatch, set the image label based on the branch name
- Master build should still be tagged with latest to get benefit of the standard production Docker label
2022-09-14 13:32:44 +03:00
Saba
13986f0e92 Adjust arguments in khoj_docker.yml for support 2022-09-14 11:47:38 +03:00
Saba
2e0ad6c8a1 Update to use new argument pattern for khoj in docker-compose 2022-09-14 11:45:47 +03:00
Saba
365ab0c00e Remove unnecessary panchayat import in rawconfig.py 2022-09-14 11:41:44 +03:00
Saba
3b5f9814d8 Revert cli.py 2022-08-27 18:06:52 +03:00
Saba
da59ec2917 Refactor Khoj - Panchayat integration to represent minimal changes required + name the new endpoint Panchayat 2022-08-27 18:05:30 +03:00
Saba
1e15d266da #4: Add VDB metadata to the compiled field of data for the jsonl processor, and return only the post_id in the raw entry 2022-08-27 17:31:30 +03:00
Saba
630abf2e17 Move vdb to a separate panchayat folder to preserve naming convention in yaml file 2022-08-11 18:14:23 -04:00
Saba
84e3211a09 Initial (hacky) solution to support search for Panchayat db 2022-08-10 18:11:36 -04:00
20 changed files with 687 additions and 79 deletions

View File

@@ -14,6 +14,9 @@ WORKDIR /app
RUN pip install --upgrade pip && \
pip install --upgrade .
# https://stackoverflow.com/questions/64776990/python-docker-no-module-found
ENV PYTHONPATH /app
# Run the Application
# There are more arguments required for the application to run,
# but these should be passed in through the docker-compose.yml file.

View File

@@ -2,39 +2,14 @@ content-type:
# The /data/folder/ prefix to the folders is here because this is
# the directory to which the local files are copied in the docker-compose.
# If changing, the docker-compose volumes should also be changed to match.
org:
panchayat :
input-files: null
input-filter: "/data/org/*.org"
compressed-jsonl: "/data/embeddings/notes.jsonl.gz"
embeddings-file: "/data/embeddings/note_embeddings.pt"
index_heading_entries: false
markdown:
input-files: null
input-filter: "/data/markdown/*.md"
compressed-jsonl: "/data/embeddings/markdown.jsonl.gz"
embeddings-file: "/data/embeddings/markdown_embeddings.pt"
ledger:
input-files: null
input-filter: /data/ledger/*.beancount
compressed-jsonl: /data/embeddings/transactions.jsonl.gz
embeddings-file: /data/embeddings/transaction_embeddings.pt
image:
input-directories: ["/data/images/"]
embeddings-file: "/data/embeddings/image_embeddings.pt"
batch-size: 50
use-xmp-metadata: false
music:
input-files: ["/data/music/music.org"]
input-filter: null
compressed-jsonl: "/data/embeddings/songs.jsonl.gz"
embeddings-file: "/data/embeddings/song_embeddings.pt"
input-filter: "/data/panchayat/*.yaml"
compressed-jsonl: "/data/embeddings/new/panchyat.jsonl.gz"
embeddings-file: "/data/embeddings/new/panchayat_embeddings.pt"
search-type:
symmetric:
symmetric:
encoder: "sentence-transformers/all-MiniLM-L6-v2"
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
model_directory: "/data/models/symmetric"
@@ -44,10 +19,6 @@ search-type:
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
model_directory: "/data/models/asymmetric"
image:
encoder: "sentence-transformers/clip-ViT-B-32"
model_directory: "/data/models/image_encoder"
processor:
#conversation:
# openai-api-key: null

View File

@@ -1,35 +1,10 @@
content-type:
org:
input-files: # ["/path/to/org-file.org"] REQUIRED IF input-filter IS NOT SET OR
input-filter: # /path/to/org/*.org REQUIRED IF input-files IS NOT SET
compressed-jsonl: "~/.khoj/content/org/org.jsonl.gz"
embeddings-file: "~/.khoj/content/org/org_embeddings.pt"
index_heading_entries: false # Set to true to index entries with empty body
markdown:
input-files: # ["/path/to/markdown-file.md"] REQUIRED IF input-filter IS NOT SET OR
input-filter: # "/path/to/markdown/*.md" REQUIRED IF input-files IS NOT SET
compressed-jsonl: "~/.khoj/content/markdown/markdown.jsonl.gz"
embeddings-file: "~/.khoj/content/markdown/markdown_embeddings.pt"
ledger:
input-files: # ["/path/to/ledger-file.beancount"] REQUIRED IF input-filter is not set OR
input-filter: # /path/to/ledger/*.beancount REQUIRED IF input-files is not set
compressed-jsonl: "~/.khoj/content/ledger/ledger.jsonl.gz"
embeddings-file: "~/.khoj/content/ledger/ledger_embeddings.pt"
image:
input-directories: # ["/path/to/images/"] REQUIRED IF input-filter IS NOT SET OR
input-filter: # /path/to/images/*.jpg REQUIRED IF input-directories IS NOT SET
embeddings-file: "~/.khoj/content/image/image_embeddings.pt"
batch-size: 50
use-xmp-metadata: false
music:
input-files: # ["/path/to/music-file.org"] REQUIRED IF input-filter IS NOT SET OR
input-filter: # /path/to/music/*.org REQUIRED IF input-files IS NOT SET
compressed-jsonl: "~/.khoj/content/music/music.jsonl.gz"
embeddings-file: "~/.khoj/content/music/music_embeddings.pt"
panchayat:
input-files: null
input-filter: ["/home/saba/projects/panchayat/panchayat/instance/*.yaml"]
compressed-jsonl: "./khoj/embeddings/panchyat.jsonl.gz"
embeddings-file: "./khoj/embeddings/panchayat_embeddings.pt"
search-type:
symmetric:
@@ -47,6 +22,3 @@ search-type:
model_directory: "~/.khoj/search/image/"
processor:
conversation:
openai-api-key: # "YOUR_OPENAI_API_KEY"
conversation-logfile: "~/.khoj/processor/conversation/conversation_logs.json"

View File

@@ -1,7 +1,8 @@
version: "3.9"
services:
server:
image: ghcr.io/debanjum/khoj:latest
build: .
# image: ghcr.io/debanjum/khoj:latest
ports:
# If changing the local port (left hand side), no other changes required.
# If changing the remote port (right hand side),
@@ -21,9 +22,10 @@ services:
- ./tests/data/ledger/:/data/ledger/
- ./tests/data/music/:/data/music/
- ./tests/data/markdown/:/data/markdown/
- /home/saba/projects/panchayat/panchayat/instance/:/data/panchayat/
# Embeddings and models are populated after the first run
# You can set these volumes to point to empty directories on host
- ./tests/data/embeddings/:/data/embeddings/
- ./tests/data/models/:/data/models/
# Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/
command: --no-gui --host="0.0.0.0" --port=8000 -c=config/khoj_docker.yml -vv
command: --no-gui -c=config/khoj_docker.yml --host="0.0.0.0" --port=8000 -vv

0
panchayat/__init__.py Normal file
View File

487
panchayat/vdb.py Normal file
View File

@@ -0,0 +1,487 @@
"""
Virtual Database that acts as an abstraction to the actual database.
VDB is the python representation of the on disk database.
VDB exposes methods to read/edit the database.
VDB can be serialized/deserialized to on disk db.
"""
import datetime
import re
from enum import Enum, auto
from typing import List, Optional, Tuple
import yaml
# from libgravatar import Gravatar # type: ignore
class Visibility(Enum):
    """Post visibility levels, ordered from most private to most public."""
    Aham = auto()  # private: only the author can see the post
    Gram = auto()  # village: visible to all logged-in users
    Lok = auto()   # world: visible to everyone, no login required
class VDB:
    """
    Python abstraction of the panchayat DB.

    Holds the full in-memory state (the user list and the post tree) and
    knows how to serialize itself back to the YAML file it came from.
    """
    # pylint: disable=too-few-public-methods

    def __init__(self, outfile: Optional[str] = None):
        # Optional[str] makes the implicit-None default explicit
        self.users = UserList()   # all registered users
        self.posts = PostTree()   # all posts, organised as a forest of TLPs
        self.outfile = outfile    # on-disk YAML path; required by commit()

    def commit(self):
        """
        Serialize the virtual database to disk, overwriting any existing file.

        Raises:
            RuntimeError: if no output file was configured for this VDB.
        """
        if not self.outfile:
            raise RuntimeError("Outfile is empty")
        # explicit utf-8 so the dump does not depend on the platform default
        with open(self.outfile, 'w', encoding='utf-8') as outfile:
            yaml.dump(self, outfile)
        # git commit
class User:
    """
    Class to represent a user on panchayat.

    The username is the primary key; the password is stored as a hash.
    """
    # pylint: disable=too-few-public-methods

    def __init__(self,
                 username: str,
                 password: str,
                 token: Optional[str] = None,
                 email: Optional[str] = None,
                 email_updates: bool = False):
        # pylint: disable=too-many-arguments
        # Optional[...] annotations make the implicit-None defaults explicit
        self.username = username  # primary key
        self.password = password  # hash
        self.token = token        # session/API token, if any
        self.email = email
        self.email_updates = email_updates  # opt-in for email notifications

    def __str__(self) -> str:
        return self.username

    def gravatar_url(self) -> str:
        """
        Return gravatar image url for the user.
        If user has email, then email is used to generate image.
        Else username is used to generate image.

        NOTE: the libgravatar call is commented out, so currently this
        returns the raw key (email or username), not an actual URL.
        """
        key = self.email if self.email else self.username
        return key
        # libgrav = Gravatar(key)
        # return libgrav.get_image(size=200, default="identicon", use_ssl=True)
class UserList(list):
    """A list of User objects that supports lookup by username."""

    def find(self, username: str) -> Optional[User]:
        """
        Return the user with the given username, or None when absent.

        Raises:
            RuntimeError: if the list somehow contains duplicate usernames.
        """
        matches = []
        for candidate in self:
            if candidate.username == username:
                matches.append(candidate)
        if not matches:
            return None
        if len(matches) != 1:
            raise RuntimeError("More than one user found for username")
        return matches[0]
class Post:  # pylint: disable=too-many-instance-attributes
    """
    Class to represent a post on panchayat.
    Inherited by LinkPost and TextPost.

    Posts form a tree: a post with no parent is a top level post (TLP);
    any other post is a comment on its parent.  Visibility is hierarchical:
    a post's effective visibility is capped by its ancestors (see the
    `visibility` property).
    """

    def __init__(
        self,
        author: User,
        title: str,
        body: str,
        visibility: Visibility = Visibility.Gram,
        upvotes=None,
        downvotes=None,
        created=None,
        parent: "Post" = None,
        post_id: int = None,
    ):
        # pylint: disable=too-many-arguments
        self.post_id = post_id  # need id for permalink
        self.author = author
        # keep the original timestamp when deserializing; default to "now"
        self.created = created \
            if created is not None else datetime.datetime.now()
        self.title = title
        self.body = body
        # votes are sets (presumably of User objects); a user holds at most one vote
        self.upvotes = upvotes if upvotes else set()
        self.downvotes = downvotes if downvotes else set()
        self.children: List[Post] = []
        self.parent = parent
        self.depth: int = parent.depth + 1 if parent else 0
        # Replying to someone else's private (Aham) post is forbidden.
        # Fixed: access the member on the Visibility class, not through the
        # `visibility` instance (member-on-member access is deprecated enum usage).
        if (self.parent and self.parent.visibility == Visibility.Aham
                and self.parent.author != self.author):
            raise RuntimeError("Cannot reply to someone else's aham post")
        self.visibility = visibility  # set visibility using setter

    @property
    def target_visibility(self):
        """
        The requested (target) visibility, ignoring any ancestor caps.
        """
        return self._visibility

    @property
    def visibility(self):
        """
        Getter method for visibility.
        Visibility can be lower than target if some ancestor has lower visibility.
        When the ancestor reaches the requested target visibility,
        self will automatically reach target visibility as well.
        """
        if self.parent and self.parent.visibility.value < self._visibility.value:
            return self.parent.visibility
        return self._visibility

    @visibility.setter
    def visibility(self, other: Visibility):
        """
        Setter method for visibility.
        If self is being made aham then parent and all descendants must be by same author.
        This setter sets _visibility property. This sets the target visibility.
        But the actual visibility can stay lower if some ancestor has lower visibility.
        """
        if other == Visibility.Aham:
            # A private post may not have replies owned by other users
            if any([
                    descendant.author != self.author
                    for descendant in self.descendants
            ]):
                raise RuntimeError(
                    "Cannot make post Aham if there are children owned by others"
                )
        self._visibility = other

    def visibility_detail_string(self):
        """
        The detailed string for visibility.
        "(Visibility.name requested)" if some descendant has a higher target visibility.
        "(Visibility.name pending)" if some ancestor is preventing this post from target visibility.
        """
        ret = ''
        if self.target_visibility != self.visibility:
            ret += f'({self.target_visibility.name} pending)'
        if self.children:
            max_visibility_request = max([
                descendant.target_visibility for descendant in self.descendants
            ],
                                         key=lambda x: x.value)
            if max_visibility_request.value > self.target_visibility.value:
                ret += f'({max_visibility_request.name} requested)'
        return ret

    def is_visible_to(self, user: User = None) -> bool:
        """
        Returns True if self is visible to user, else False.
        Lok posts are public; Gram posts need any logged-in user;
        Aham posts are visible only to their author.
        """
        if self.visibility == Visibility.Lok:
            return True
        if self.visibility == Visibility.Gram and user:
            return True
        if self.visibility == Visibility.Aham and self.author == user:
            return True
        return False

    @property
    def descendants(self) -> List["Post"]:
        """
        Return all my descendants with inorder traversal.
        Children are visited oldest-first; does not include self.
        """
        my_descendants = []
        for child in sorted(self.children, key=lambda post: post.created):
            my_descendants.append(child)
            my_descendants.extend(child.descendants)
        return my_descendants

    @property
    def family(self) -> List["Post"]:
        """
        Return list of posts in family.
        Two posts belong to same family if they share the same TLP.
        """
        return self.tlp.descendants_and_i

    @property
    def descendants_and_i(self) -> List["Post"]:
        """
        Return all my descendants with inorder traversal.
        Includes self (as the first element).
        """
        return [self] + self.descendants

    @property
    def ancestors(self) -> List["Post"]:
        """
        Return all my ancestors, oldest first.
        Does not include self.
        """
        if self.parent:
            return self.parent.ancestors + [self.parent]
        return []

    @property
    def ancestry(self) -> List["Post"]:
        """
        Return all my ancestors including self (self is last).
        """
        if self.parent:
            return self.parent.ancestry + [self]
        return [self]

    @property
    def tlp(self) -> "Post":
        """
        Return my top level post (the depth-0 root of my subtree).
        """
        if self.is_tlp():
            return self
        return self.parent.tlp  #type: ignore

    def is_tlp(self) -> bool:
        """
        Return True if I am a top level post.
        """
        return self.depth == 0

    def is_leaf(self) -> bool:
        """
        Return True if I am a leaf post (no replies).
        """
        return not self.children

    @property
    def vote_count(self) -> int:
        """
        Return the effective vote count of this post: upvotes - downvotes.
        """
        return len(self.upvotes) - len(self.downvotes)

    def upvote_string(self) -> str:
        """
        Return comma-separated usernames of all users who upvoted this post.
        """
        return ', '.join([user.username for user in self.upvotes])

    def downvote_string(self) -> str:
        """
        Return comma-separated usernames of all users who downvoted this post.
        """
        return ', '.join([user.username for user in self.downvotes])

    def __str__(self) -> str:
        # comments typically have no title, so fall back to the body
        if self.title:
            return self.title
        return self.body

    def nullvote(self, user: User):
        """
        Remove user's vote (up or down) from this post.
        """
        self.upvotes.discard(user)
        self.downvotes.discard(user)

    def upvote(self, user: User):
        """
        Upvote this post. Upvote is done by voiding previous vote and creating new one.
        """
        self.nullvote(user)
        self.upvotes.add(user)

    def downvote(self, user: User):
        """
        Downvote this post. Downvote is done by voiding previous vote and creating new one.
        """
        self.nullvote(user)
        self.downvotes.add(user)

    def delete(self):
        """
        Delete this post. Does not remove the post from db,
        but only overwrites title and body with 'DELETED'.
        This is done to not break other posts that have reference to the deleted one.
        """
        self.title = "DELETED"
        self.body = "DELETED"

    def family_last_modified(self) -> datetime.datetime:
        """
        Return when the post family was last modified.
        Max of created for all posts in family.
        """
        return max([post.created for post in self.family])
class LinkPost(Post):
    """
    Class to represent a link post on panchayat
    """

    def is_url(self) -> bool:  # pylint: disable=missing-function-docstring, no-self-use
        # A link post's body is always a URL
        return True
class TextPost(Post):
    """
    Class to represent a text post on panchayat
    """

    def is_url(self) -> bool:  #pylint: disable=missing-function-docstring, no-self-use
        # A text post's body is prose, not a URL
        return False

    @property
    def html_body(self) -> str:
        """
        Return html string with all urls in body converted to hrefs
        Regex taken from https://urlregex.com/
        Trailing period and parenthesis was appended to remove false positives
        """
        # pylint: disable=line-too-long
        linkify = re.compile(
            r'''(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+[^\. \)])'''
        )
        # wrap each matched url in an anchor tag that opens in a new tab
        return linkify.sub(
            lambda match: f'<a href="{match.group(1)}" target="_blank">{match.group(1)}</a>',
            self.body)
class PostTree:
    """
    Class to represent a tree of posts.

    Stores only the top level posts (TLPs); comments hang off their
    parents via Post.children, so every query walks the forest from tlps.
    """
    def __init__(self):
        # Top level posts (depth 0); each TLP roots its own comment subtree
        self.tlps: List[Post] = []
    def zig_zag(self) -> List[Post]:
        """
        Return all posts in zig zag order.
        TLPs are in reverse chronological order.
        Comments are ordered chronologically.
        """
        all_posts = []
        reverse_chrono_tlps = sorted(self.tlps,
                                     key=lambda post: post.created,
                                     reverse=True)
        for tlp in reverse_chrono_tlps:
            all_posts.append(tlp)
            # Post.descendants already yields the subtree oldest-first
            all_posts.extend(tlp.descendants)
        return all_posts
    def compressed_reverse_chrono_ancestry(
            self, requesting_user: User) -> List[Tuple[Post, bool, bool]]:
        """
        Returns a list of all posts with their ancestors.
        The post is attached to two boolean fields wrapped inside a tuple
        for use by the jinja template.
        First boolean indicates whether this post must be highlighted.
        Second boolean indicates whether a new TLP boundary has reached.
        Ancestry is not repeated when the subsequent post shares ancestors.
        This query is used in the activity view.
        """
        ret: List[Tuple[Post, bool, bool]] = []
        prev_ancestors: List[Post] = []
        prev_tlp: Optional[Post] = None
        for post in self.reverse_chrono():
            # skip posts this user is not allowed to see
            if not post.is_visible_to(requesting_user):
                continue
            if prev_tlp and post.tlp is not prev_tlp:
                # make tlp_switch true for the previous post
                # (tuples are immutable, so replace the last entry wholesale)
                ret[-1] = (ret[-1][0], ret[-1][1], True)
            # emit only the ancestors not already shown for the previous post
            ret.extend([(ancestor, False, False) for ancestor in post.ancestors
                        if ancestor not in prev_ancestors])
            # add current post with highlight true
            ret.append((post, True, False))
            prev_ancestors = post.ancestry
            prev_tlp = post.tlp
        return ret
    def all(self) -> List[Post]:
        """
        Return list of all posts in any order.
        Currently zig_zag order.
        """
        return self.zig_zag()
    def reverse_chrono(self) -> List[Post]:
        """
        Return all posts in reverse chronological order
        """
        return sorted(self.all(), key=lambda post: post.created, reverse=True)
    def find(self, post_id: int) -> Optional[Post]:
        """
        Find a post by post id; return None when no post matches.

        Raises:
            RuntimeError: if more than one post carries the same id.
        """
        post = [post for post in self.all() if post.post_id == post_id]
        if not post:
            return None
        if len(post) != 1:
            raise RuntimeError(
                "There should only have been one post with a given id")
        return post[0]
    def insert(self, post: Post):
        """
        Insert a post into the posttree.
        If the post does not have an id already assign the smallest available one.
        If post has a parent add the post as child of parent.
        Else add the post as a TLP.

        Raises:
            RuntimeError: if a post with the same id is already present.
        """
        if post.post_id is None:
            # next id = max existing id + 1 (the comprehension's `post` is
            # scoped to the comprehension, so the parameter is not clobbered)
            post.post_id = max(  #type:ignore
                [post.post_id for post in self.all()],
                default=0) + 1  #type: ignore
        if self.find(post.post_id) is not None:
            raise RuntimeError("Posttree already contains post with id")
        if post.parent is None:
            self.tlps.append(post)
        else:
            post.parent.children.append(post)
    def tlp_count(self, user: User) -> int:
        """
        Return #TLPs by the user
        """
        return len([post for post in self.tlps if post.author == user])
    def comment_count(self, user: User) -> int:
        """
        Return #comments by user (posts at depth > 0)
        """
        return len([
            post for post in self.all()
            if post.depth != 0 and post.author == user
        ])
    def upvote_count(self, user: User) -> int:
        """
        Return #upvotes by user
        """
        return len([post for post in self.all() if user in post.upvotes])
    def downvote_count(self, user: User) -> int:
        """
        Return #downvotes by user
        """
        return len([post for post in self.all() if user in post.downvotes])

View File

@@ -8,6 +8,7 @@ import json
# Internal Packages
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl
from src.processor.panchayat.panchayat_to_jsonl import panchayat_to_jsonl
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
from src.search_type import image_search, text_search
from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
@@ -70,6 +71,16 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()])
# Initialize Panchayat Search
if (t == SearchType.Panchayat or t == None) and config.content_type.panchayat:
# Extract Entries, Generate Yaml Embeddings
model.panchayat_search = text_search.setup(
panchayat_to_jsonl,
config.content_type.panchayat,
search_config=config.search_type.asymmetric,
regenerate=regenerate,
filters=[DateFilter(entry_key='compiled'), WordFilter(entry_key='compiled')])
# Initialize Ledger Search
if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
# Extract Entries, Generate Ledger Embeddings

View File

@@ -117,7 +117,7 @@
function populate_type_dropdown() {
// Populate type dropdown field with enabled search types only
var possible_search_types = ["org", "markdown", "ledger", "music", "image"];
var possible_search_types = ["org", "markdown", "ledger", "music", "image", "panchayat"];
fetch("/config/data")
.then(response => response.json())
.then(data => {

View File

@@ -12,6 +12,7 @@ from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_
from src.utils.constants import empty_escape_sequences
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
from src.utils.rawconfig import TextContentConfig
from typing import List
logger = logging.getLogger(__name__)
@@ -112,7 +113,7 @@ def extract_beancount_transactions(beancount_files):
return entries, dict(transaction_to_file_map)
def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]:
def convert_transactions_to_maps(entries: List[str], transaction_to_file_map) -> List[dict]:
"Convert each Beancount transaction into a dictionary"
entry_maps = []
for entry in entries:
@@ -123,6 +124,6 @@ def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) ->
return entry_maps
def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str:
def convert_transaction_maps_to_jsonl(entries: List[dict]) -> str:
"Convert each Beancount transaction dictionary to JSON and collate as JSONL"
return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])

View File

@@ -13,6 +13,7 @@ from src.utils.constants import empty_escape_sequences
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
from src.utils.rawconfig import TextContentConfig
from typing import List
logger = logging.getLogger(__name__)
@@ -111,7 +112,7 @@ def extract_markdown_entries(markdown_files):
return entries, dict(entry_to_file_map)
def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
def convert_markdown_entries_to_maps(entries: List[str], entry_to_file_map) -> List[dict]:
"Convert each Markdown entries into a dictionary"
entry_maps = []
for entry in entries:

View File

@@ -14,6 +14,8 @@ from src.utils.jsonl import dump_jsonl, compress_jsonl_data
from src.utils import state
from src.utils.rawconfig import TextContentConfig
from typing import List
logger = logging.getLogger(__name__)
@@ -105,7 +107,7 @@ def extract_org_entries(org_files):
return entries, dict(entry_to_file_map)
def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
def convert_org_nodes_to_entries(entries: List[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> List[dict]:
"Convert Org-Mode entries into list of dictionary"
entry_maps = []
for entry in entries:

View File

View File

@@ -0,0 +1,138 @@
#!/usr/bin/env python3
# Standard Packages
import json
import logging
import glob
import yaml
# Internal Packages
from panchayat import vdb
from src.utils.helpers import get_absolute_path, is_none_or_empty
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
from src.utils.rawconfig import TextContentConfig
logger = logging.getLogger(__name__)
def panchayat_constructor(loader, node):
    "PyYAML constructor that rebuilds a panchayat VDB object from a mapping node"
    mapping = loader.construct_mapping(node)
    return vdb.VDB(**mapping)
class VDBEntry():
    """Lightweight view of a single panchayat post used while building jsonl entries."""
    # Expected field types (class-level annotations, no defaults)
    post_id: str
    body: str
    title: str
    author: str

    def __init__(self, post_id, body, title, author):
        # plain attribute copies; no validation is performed
        self.author = author
        self.title = title
        self.body = body
        self.post_id = post_id
# Define Functions
def panchayat_to_jsonl(config: TextContentConfig, previous_entries = None):
    """
    Extract panchayat posts from the configured yaml files, write them out as
    (optionally gzip-compressed) jsonl, and return the indexed entries as
    a list of (index, entry) tuples.

    `previous_entries` is accepted for signature parity with the other
    *_to_jsonl processors but is currently unused.
    """
    # Input Validation
    if is_none_or_empty(config.input_files) and is_none_or_empty(config.input_filter):
        print("At least one of input-files or input-file-filter is required to be specified")
        # `exit()` is a site.py convenience not guaranteed to exist;
        # SystemExit(1) is the reliable equivalent
        raise SystemExit(1)

    # Get Panchayat yaml files to process (fixed copy-pasted "Markdown" wording)
    yaml_files = get_panchayat_files(config.input_files, config.input_filter)
    # NOTE(review): assumed to be a pathlib.Path (`.suffix` is used below) — confirm
    output_file = config.compressed_jsonl

    # Extract entries from the specified yaml files
    entries = extract_panchayat_entries(yaml_files)

    # Convert extracted entries to jsonl format
    jsonl_data = convert_panchayat_entries_to_jsonl(entries)

    # Compress the jsonl data when the target filename asks for it
    if output_file.suffix == ".gz":
        compress_jsonl_data(jsonl_data, output_file)
    elif output_file.suffix == ".jsonl":
        dump_jsonl(jsonl_data, output_file)

    return list(enumerate(entries))
def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
    "Get the Panchayat yaml files to process from explicit paths and/or glob filters"
    absolute_yaml_files, filtered_yaml_files = set(), set()
    if yaml_files:
        absolute_yaml_files = {get_absolute_path(yaml_file) for yaml_file in yaml_files}
    if yaml_file_filter:
        # each filter entry is a glob pattern; renamed loop variable to avoid
        # shadowing the builtin `filter`
        filtered_yaml_files = {
            filtered_file
            for file_filter in yaml_file_filter
            for filtered_file in glob.glob(get_absolute_path(file_filter))
        }

    all_yaml_files = sorted(absolute_yaml_files | filtered_yaml_files)

    # Warn about input files that do not look like yaml
    files_with_non_yaml_extensions = {
        yaml_file
        for yaml_file
        in all_yaml_files
        if not yaml_file.endswith(".yaml") and not yaml_file.endswith(".yml")
    }
    if files_with_non_yaml_extensions:
        # logger.warn is deprecated in favour of logger.warning;
        # message also fixed (it previously said "markdown-mode" files)
        logger.warning(f"[Warning] There maybe non yaml files in the input set: {files_with_non_yaml_extensions}")

    if verbose > 0:
        print(f'Processing files: {all_yaml_files}')

    return all_yaml_files
def extract_panchayat_entries(yaml_files):
    """
    Extract entries by post from specified Yaml files.

    Each entry is a dict with a 'compiled' text blob (body, author, title,
    created date and upvote count — used for embedding/search) and a 'raw'
    field holding only the post_id.
    """
    entries = []
    for yaml_file in yaml_files:
        # explicit encoding so parsing does not depend on the platform default
        with open(yaml_file, encoding="utf-8") as f:
            # SECURITY: UnsafeLoader can instantiate arbitrary python objects.
            # It is needed here to rebuild panchayat VDB objects, so the yaml
            # input files must come from a trusted source only.
            raw_data = yaml.load(f, Loader=yaml.UnsafeLoader)
            # de-duplicate within this file: zig_zag yields every post as part
            # of some TLP's subtree and descendants_and_i revisits them
            seen_ids = set()
            for post in raw_data.posts.zig_zag():
                all_subposts = post.descendants_and_i
                for subpost in all_subposts:
                    if subpost.post_id not in seen_ids:
                        seen_ids.add(subpost.post_id)
                        entry = {}
                        entry['compiled'] = f"""body: {subpost.body}
author: {subpost.author.username}
title: {subpost.title}
created: {subpost.created}
upvotes: {len(subpost.upvotes)}"""
                        entry['raw'] = subpost.post_id
                        entries.append(entry)
    return entries
def convert_panchayat_entries_to_jsonl(entries, verbose=0):
    """
    Convert each Panchayat Yaml entry to JSON and collate as JSONL.

    `verbose` is kept for signature compatibility with the other *_to_jsonl
    helpers but is currently unused.  (Removed the dead commented-out
    implementation that previously lived here.)
    """
    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])

View File

@@ -109,6 +109,17 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
results = text_search.collate_results(hits, entries, results_count)
collate_end = time.time()
if (t == SearchType.Panchayat or t == None) and state.model.panchayat_search:
# query Panchayat yaml files
query_start = time.time()
hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r)
query_end = time.time()
# collate and return results
collate_start = time.time()
results = text_search.collate_results(hits, entries, results_count)
collate_end = time.time()
if (t == SearchType.Image or t == None) and state.model.image_search:
# query images
query_start = time.time()

View File

@@ -1,6 +1,8 @@
# Standard Packages
from abc import ABC, abstractmethod
from typing import List, Set, Tuple
class BaseFilter(ABC):
@abstractmethod
@@ -12,5 +14,5 @@ class BaseFilter(ABC):
pass
@abstractmethod
def apply(self, query:str, raw_entries:list[str]) -> tuple[str, set[int]]:
def apply(self, query:str, raw_entries:List[str]) -> Tuple[str, Set[int]]:
pass

View File

@@ -14,6 +14,7 @@ from src.utils.config import TextSearchModel
from src.utils.rawconfig import TextSearchConfig, TextContentConfig
from src.utils.jsonl import load_jsonl
from typing import List
logger = logging.getLogger(__name__)
@@ -179,7 +180,7 @@ def collate_results(hits, entries, count=5):
in hits[0:count]]
def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel:
def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: List[BaseFilter] = []) -> TextSearchModel:
# Initialize Model
bi_encoder, cross_encoder, top_k = initialize_model(search_config)

View File

@@ -7,7 +7,6 @@ from importlib.metadata import version
from src.utils.helpers import resolve_absolute_path
from src.utils.yaml import parse_config_from_file
def cli(args=None):
# Setup Argument Parser for the Commandline Interface
parser = argparse.ArgumentParser(description="Start Khoj; A Natural Language Search Engine for your personal Notes, Transactions and Photos")

View File

@@ -7,12 +7,15 @@ from pathlib import Path
from src.utils.rawconfig import ConversationProcessorConfig
from src.search_filter.base_filter import BaseFilter
from typing import List
class SearchType(str, Enum):
Org = "org"
Ledger = "ledger"
Music = "music"
Markdown = "markdown"
Panchayat = "panchayat"
Image = "image"
@@ -21,7 +24,7 @@ class ProcessorType(str, Enum):
class TextSearchModel():
def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: list[BaseFilter], top_k):
def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: List[BaseFilter], top_k):
self.entries = entries
self.corpus_embeddings = corpus_embeddings
self.bi_encoder = bi_encoder
@@ -45,6 +48,7 @@ class SearchModels():
ledger_search: TextSearchModel = None
music_search: TextSearchModel = None
markdown_search: TextSearchModel = None
panchayat_search: TextSearchModel = None
image_search: ImageSearchModel = None

View File

@@ -44,6 +44,7 @@ class ContentConfig(ConfigBase):
ledger: Optional[TextContentConfig]
image: Optional[ImageContentConfig]
music: Optional[TextContentConfig]
panchayat: Optional[TextContentConfig]
markdown: Optional[TextContentConfig]
class TextSearchConfig(ConfigBase):

View File

@@ -10,6 +10,8 @@ from src.utils.config import SearchModels, ProcessorConfigModel
from src.utils.helpers import LRU
from src.utils.rawconfig import FullConfig
from typing import List
# Application Global State
config = FullConfig()
model = SearchModels()
@@ -18,7 +20,7 @@ config_file: Path = None
verbose: int = 0
host: str = None
port: int = None
cli_args: list[str] = None
cli_args: List[str] = None
query_cache = LRU()
if torch.cuda.is_available():