Compare commits

...

14 Commits

Author SHA1 Message Date
Saba
21a9fbcea3 Update panchayat yaml to jsonl file to compile additional attributes
- Update typing for list to use the List object from typing module
- Parse number of upvotes, created date
- Add support for word filter and date filter on compiled entries
2022-12-28 09:50:44 -03:00
Saba
21eb58156c Resolve merge conflict in build.yml 2022-09-15 20:35:02 +03:00
Saba
63f2312b84 Update panchayat-to-jsonl changes to merge with master 2022-09-15 20:08:54 +03:00
Saba
f12ca56e93 Merge intermediate changes 2022-09-14 21:09:30 +03:00
Saba
ea62d47aa5 Address merge conflicts from master branch 2022-09-14 14:23:17 +03:00
Saba
240901e07f Add workflow dispatch support in build.yml
- To support dispatch, set the image label based on the branch name
- Master build should still be tagged with latest to get benefit of the standard production Docker label
2022-09-14 13:32:44 +03:00
Saba
13986f0e92 Adjust arguments in khoj_docker.yml for support 2022-09-14 11:47:38 +03:00
Saba
2e0ad6c8a1 Update to use new argument pattern for khoj in docker-compose 2022-09-14 11:45:47 +03:00
Saba
365ab0c00e Remove unnecessary panchayat import in rawconfig.py 2022-09-14 11:41:44 +03:00
Saba
3b5f9814d8 Revert cli.py 2022-08-27 18:06:52 +03:00
Saba
da59ec2917 Refactor Khoj - Panchayat integration to represent minimal changes required + name the new endpoint Panchayat 2022-08-27 18:05:30 +03:00
Saba
1e15d266da #4: Add VDB metadata to the compiled field of data for the jsonl processor, and return only the post_id in the raw entry 2022-08-27 17:31:30 +03:00
Saba
630abf2e17 Move vdb to a separate panchayat folder to preserve naming convention in yaml file 2022-08-11 18:14:23 -04:00
Saba
84e3211a09 Initial (hacky) solution to support search for Panchayat db 2022-08-10 18:11:36 -04:00
20 changed files with 687 additions and 79 deletions

View File

@@ -14,6 +14,9 @@ WORKDIR /app
RUN pip install --upgrade pip && \
pip install --upgrade .
# https://stackoverflow.com/questions/64776990/python-docker-no-module-found
ENV PYTHONPATH /app
# Run the Application
# There are more arguments required for the application to run,
# but these should be passed in through the docker-compose.yml file.

View File

@@ -2,39 +2,14 @@ content-type:
# The /data/folder/ prefix to the folders is here because this is
# the directory to which the local files are copied in the docker-compose.
# If changing, the docker-compose volumes should also be changed to match.
org:
panchayat :
input-files: null
input-filter: "/data/org/*.org"
compressed-jsonl: "/data/embeddings/notes.jsonl.gz"
embeddings-file: "/data/embeddings/note_embeddings.pt"
index_heading_entries: false
markdown:
input-files: null
input-filter: "/data/markdown/*.md"
compressed-jsonl: "/data/embeddings/markdown.jsonl.gz"
embeddings-file: "/data/embeddings/markdown_embeddings.pt"
ledger:
input-files: null
input-filter: /data/ledger/*.beancount
compressed-jsonl: /data/embeddings/transactions.jsonl.gz
embeddings-file: /data/embeddings/transaction_embeddings.pt
image:
input-directories: ["/data/images/"]
embeddings-file: "/data/embeddings/image_embeddings.pt"
batch-size: 50
use-xmp-metadata: false
music:
input-files: ["/data/music/music.org"]
input-filter: null
compressed-jsonl: "/data/embeddings/songs.jsonl.gz"
embeddings-file: "/data/embeddings/song_embeddings.pt"
input-filter: "/data/panchayat/*.yaml"
compressed-jsonl: "/data/embeddings/new/panchyat.jsonl.gz"
embeddings-file: "/data/embeddings/new/panchayat_embeddings.pt"
search-type:
symmetric:
symmetric:
encoder: "sentence-transformers/all-MiniLM-L6-v2"
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
model_directory: "/data/models/symmetric"
@@ -44,10 +19,6 @@ search-type:
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
model_directory: "/data/models/asymmetric"
image:
encoder: "sentence-transformers/clip-ViT-B-32"
model_directory: "/data/models/image_encoder"
processor:
#conversation:
# openai-api-key: null

View File

@@ -1,35 +1,10 @@
content-type:
org:
input-files: # ["/path/to/org-file.org"] REQUIRED IF input-filter IS NOT SET OR
input-filter: # /path/to/org/*.org REQUIRED IF input-files IS NOT SET
compressed-jsonl: "~/.khoj/content/org/org.jsonl.gz"
embeddings-file: "~/.khoj/content/org/org_embeddings.pt"
index_heading_entries: false # Set to true to index entries with empty body
markdown:
input-files: # ["/path/to/markdown-file.md"] REQUIRED IF input-filter IS NOT SET OR
input-filter: # "/path/to/markdown/*.md" REQUIRED IF input-files IS NOT SET
compressed-jsonl: "~/.khoj/content/markdown/markdown.jsonl.gz"
embeddings-file: "~/.khoj/content/markdown/markdown_embeddings.pt"
ledger:
input-files: # ["/path/to/ledger-file.beancount"] REQUIRED IF input-filter is not set OR
input-filter: # /path/to/ledger/*.beancount REQUIRED IF input-files is not set
compressed-jsonl: "~/.khoj/content/ledger/ledger.jsonl.gz"
embeddings-file: "~/.khoj/content/ledger/ledger_embeddings.pt"
image:
input-directories: # ["/path/to/images/"] REQUIRED IF input-filter IS NOT SET OR
input-filter: # /path/to/images/*.jpg REQUIRED IF input-directories IS NOT SET
embeddings-file: "~/.khoj/content/image/image_embeddings.pt"
batch-size: 50
use-xmp-metadata: false
music:
input-files: # ["/path/to/music-file.org"] REQUIRED IF input-filter IS NOT SET OR
input-filter: # /path/to/music/*.org REQUIRED IF input-files IS NOT SET
compressed-jsonl: "~/.khoj/content/music/music.jsonl.gz"
embeddings-file: "~/.khoj/content/music/music_embeddings.pt"
panchayat:
input-files: null
input-filter: ["/home/saba/projects/panchayat/panchayat/instance/*.yaml"]
compressed-jsonl: "./khoj/embeddings/panchyat.jsonl.gz"
embeddings-file: "./khoj/embeddings/panchayat_embeddings.pt"
search-type:
symmetric:
@@ -47,6 +22,3 @@ search-type:
model_directory: "~/.khoj/search/image/"
processor:
conversation:
openai-api-key: # "YOUR_OPENAI_API_KEY"
conversation-logfile: "~/.khoj/processor/conversation/conversation_logs.json"

View File

@@ -1,7 +1,8 @@
version: "3.9"
services:
server:
image: ghcr.io/debanjum/khoj:latest
build: .
# image: ghcr.io/debanjum/khoj:latest
ports:
# If changing the local port (left hand side), no other changes required.
# If changing the remote port (right hand side),
@@ -21,9 +22,10 @@ services:
- ./tests/data/ledger/:/data/ledger/
- ./tests/data/music/:/data/music/
- ./tests/data/markdown/:/data/markdown/
- /home/saba/projects/panchayat/panchayat/instance/:/data/panchayat/
# Embeddings and models are populated after the first run
# You can set these volumes to point to empty directories on host
- ./tests/data/embeddings/:/data/embeddings/
- ./tests/data/models/:/data/models/
# Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/
command: --no-gui --host="0.0.0.0" --port=8000 -c=config/khoj_docker.yml -vv
command: --no-gui -c=config/khoj_docker.yml --host="0.0.0.0" --port=8000 -vv

0
panchayat/__init__.py Normal file
View File

487
panchayat/vdb.py Normal file
View File

@@ -0,0 +1,487 @@
"""
Virtual Database that acts as an abstraction to the actual database.
VDB is the python representation of the on disk database.
VDB exposes methods to read/edit the database.
VDB can be serialized/deserialized to on disk db.
"""
import datetime
import re
from enum import Enum, auto
from typing import List, Optional, Tuple
import yaml
# from libgravatar import Gravatar # type: ignore
class Visibility(Enum):
    """Post visibility levels, ordered from most private to most public."""
    Aham = auto()  # private: only the author can see the post
    Gram = auto()  # village: visible to all logged-in users
    Lok = auto()   # world: visible to everyone, no login required
class VDB:
    """
    Python abstraction of the panchayat DB.

    Holds the full in-memory state (the user list and the post tree) and
    knows how to serialize itself back to the YAML file it came from.
    """
    # pylint: disable=too-few-public-methods

    def __init__(self, outfile: Optional[str] = None):
        # Optional[str] makes the implicit-None default explicit
        self.users = UserList()   # all registered users
        self.posts = PostTree()   # all posts, organised as a forest of TLPs
        self.outfile = outfile    # on-disk YAML path; required by commit()

    def commit(self):
        """
        Serialize the virtual database to disk, overwriting any existing file.

        Raises:
            RuntimeError: if no output file was configured for this VDB.
        """
        if not self.outfile:
            raise RuntimeError("Outfile is empty")
        # explicit utf-8 so the dump does not depend on the platform default
        with open(self.outfile, 'w', encoding='utf-8') as outfile:
            yaml.dump(self, outfile)
        # git commit
class User:
    """
    Class to represent a user on panchayat.

    The username is the primary key; the password is stored as a hash.
    """
    # pylint: disable=too-few-public-methods

    def __init__(self,
                 username: str,
                 password: str,
                 token: Optional[str] = None,
                 email: Optional[str] = None,
                 email_updates: bool = False):
        # pylint: disable=too-many-arguments
        # Optional[...] annotations make the implicit-None defaults explicit
        self.username = username  # primary key
        self.password = password  # hash
        self.token = token        # session/API token, if any
        self.email = email
        self.email_updates = email_updates  # opt-in for email notifications

    def __str__(self) -> str:
        return self.username

    def gravatar_url(self) -> str:
        """
        Return gravatar image url for the user.
        If user has email, then email is used to generate image.
        Else username is used to generate image.

        NOTE: the libgravatar call is commented out, so currently this
        returns the raw key (email or username), not an actual URL.
        """
        key = self.email if self.email else self.username
        return key
        # libgrav = Gravatar(key)
        # return libgrav.get_image(size=200, default="identicon", use_ssl=True)
class UserList(list):
    """A list of User objects that supports lookup by username."""

    def find(self, username: str) -> Optional[User]:
        """
        Return the user with the given username, or None when absent.

        Raises:
            RuntimeError: if the list somehow contains duplicate usernames.
        """
        matches = []
        for candidate in self:
            if candidate.username == username:
                matches.append(candidate)
        if not matches:
            return None
        if len(matches) != 1:
            raise RuntimeError("More than one user found for username")
        return matches[0]
class Post:  # pylint: disable=too-many-instance-attributes
    """
    Class to represent a post on panchayat.
    Inherited by LinkPost and TextPost.

    Posts form a tree: a post with no parent is a top level post (TLP);
    any other post is a comment on its parent.  Visibility is hierarchical:
    a post's effective visibility is capped by its ancestors (see the
    `visibility` property).
    """

    def __init__(
        self,
        author: User,
        title: str,
        body: str,
        visibility: Visibility = Visibility.Gram,
        upvotes=None,
        downvotes=None,
        created=None,
        parent: "Post" = None,
        post_id: int = None,
    ):
        # pylint: disable=too-many-arguments
        self.post_id = post_id  # need id for permalink
        self.author = author
        # keep the original timestamp when deserializing; default to "now"
        self.created = created \
            if created is not None else datetime.datetime.now()
        self.title = title
        self.body = body
        # votes are sets (presumably of User objects); a user holds at most one vote
        self.upvotes = upvotes if upvotes else set()
        self.downvotes = downvotes if downvotes else set()
        self.children: List[Post] = []
        self.parent = parent
        self.depth: int = parent.depth + 1 if parent else 0
        # Replying to someone else's private (Aham) post is forbidden.
        # Fixed: access the member on the Visibility class, not through the
        # `visibility` instance (member-on-member access is deprecated enum usage).
        if (self.parent and self.parent.visibility == Visibility.Aham
                and self.parent.author != self.author):
            raise RuntimeError("Cannot reply to someone else's aham post")
        self.visibility = visibility  # set visibility using setter

    @property
    def target_visibility(self):
        """
        The requested (target) visibility, ignoring any ancestor caps.
        """
        return self._visibility

    @property
    def visibility(self):
        """
        Getter method for visibility.
        Visibility can be lower than target if some ancestor has lower visibility.
        When the ancestor reaches the requested target visibility,
        self will automatically reach target visibility as well.
        """
        if self.parent and self.parent.visibility.value < self._visibility.value:
            return self.parent.visibility
        return self._visibility

    @visibility.setter
    def visibility(self, other: Visibility):
        """
        Setter method for visibility.
        If self is being made aham then parent and all descendants must be by same author.
        This setter sets _visibility property. This sets the target visibility.
        But the actual visibility can stay lower if some ancestor has lower visibility.
        """
        if other == Visibility.Aham:
            # A private post may not have replies owned by other users
            if any([
                    descendant.author != self.author
                    for descendant in self.descendants
            ]):
                raise RuntimeError(
                    "Cannot make post Aham if there are children owned by others"
                )
        self._visibility = other

    def visibility_detail_string(self):
        """
        The detailed string for visibility.
        "(Visibility.name requested)" if some descendant has a higher target visibility.
        "(Visibility.name pending)" if some ancestor is preventing this post from target visibility.
        """
        ret = ''
        if self.target_visibility != self.visibility:
            ret += f'({self.target_visibility.name} pending)'
        if self.children:
            max_visibility_request = max([
                descendant.target_visibility for descendant in self.descendants
            ],
                                         key=lambda x: x.value)
            if max_visibility_request.value > self.target_visibility.value:
                ret += f'({max_visibility_request.name} requested)'
        return ret

    def is_visible_to(self, user: User = None) -> bool:
        """
        Returns True if self is visible to user, else False.
        Lok posts are public; Gram posts need any logged-in user;
        Aham posts are visible only to their author.
        """
        if self.visibility == Visibility.Lok:
            return True
        if self.visibility == Visibility.Gram and user:
            return True
        if self.visibility == Visibility.Aham and self.author == user:
            return True
        return False

    @property
    def descendants(self) -> List["Post"]:
        """
        Return all my descendants with inorder traversal.
        Children are visited oldest-first; does not include self.
        """
        my_descendants = []
        for child in sorted(self.children, key=lambda post: post.created):
            my_descendants.append(child)
            my_descendants.extend(child.descendants)
        return my_descendants

    @property
    def family(self) -> List["Post"]:
        """
        Return list of posts in family.
        Two posts belong to same family if they share the same TLP.
        """
        return self.tlp.descendants_and_i

    @property
    def descendants_and_i(self) -> List["Post"]:
        """
        Return all my descendants with inorder traversal.
        Includes self (as the first element).
        """
        return [self] + self.descendants

    @property
    def ancestors(self) -> List["Post"]:
        """
        Return all my ancestors, oldest first.
        Does not include self.
        """
        if self.parent:
            return self.parent.ancestors + [self.parent]
        return []

    @property
    def ancestry(self) -> List["Post"]:
        """
        Return all my ancestors including self (self is last).
        """
        if self.parent:
            return self.parent.ancestry + [self]
        return [self]

    @property
    def tlp(self) -> "Post":
        """
        Return my top level post (the depth-0 root of my subtree).
        """
        if self.is_tlp():
            return self
        return self.parent.tlp  #type: ignore

    def is_tlp(self) -> bool:
        """
        Return True if I am a top level post.
        """
        return self.depth == 0

    def is_leaf(self) -> bool:
        """
        Return True if I am a leaf post (no replies).
        """
        return not self.children

    @property
    def vote_count(self) -> int:
        """
        Return the effective vote count of this post: upvotes - downvotes.
        """
        return len(self.upvotes) - len(self.downvotes)

    def upvote_string(self) -> str:
        """
        Return comma-separated usernames of all users who upvoted this post.
        """
        return ', '.join([user.username for user in self.upvotes])

    def downvote_string(self) -> str:
        """
        Return comma-separated usernames of all users who downvoted this post.
        """
        return ', '.join([user.username for user in self.downvotes])

    def __str__(self) -> str:
        # comments typically have no title, so fall back to the body
        if self.title:
            return self.title
        return self.body

    def nullvote(self, user: User):
        """
        Remove user's vote (up or down) from this post.
        """
        self.upvotes.discard(user)
        self.downvotes.discard(user)

    def upvote(self, user: User):
        """
        Upvote this post. Upvote is done by voiding previous vote and creating new one.
        """
        self.nullvote(user)
        self.upvotes.add(user)

    def downvote(self, user: User):
        """
        Downvote this post. Downvote is done by voiding previous vote and creating new one.
        """
        self.nullvote(user)
        self.downvotes.add(user)

    def delete(self):
        """
        Delete this post. Does not remove the post from db,
        but only overwrites title and body with 'DELETED'.
        This is done to not break other posts that have reference to the deleted one.
        """
        self.title = "DELETED"
        self.body = "DELETED"

    def family_last_modified(self) -> datetime.datetime:
        """
        Return when the post family was last modified.
        Max of created for all posts in family.
        """
        return max([post.created for post in self.family])
class LinkPost(Post):
    """
    Class to represent a link post on panchayat
    """

    def is_url(self) -> bool:  # pylint: disable=missing-function-docstring, no-self-use
        # A link post's body is always a URL
        return True
class TextPost(Post):
    """
    Class to represent a text post on panchayat
    """

    def is_url(self) -> bool:  #pylint: disable=missing-function-docstring, no-self-use
        # A text post's body is prose, not a URL
        return False

    @property
    def html_body(self) -> str:
        """
        Return html string with all urls in body converted to hrefs
        Regex taken from https://urlregex.com/
        Trailing period and parenthesis was appended to remove false positives
        """
        # pylint: disable=line-too-long
        linkify = re.compile(
            r'''(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+[^\. \)])'''
        )
        # wrap each matched url in an anchor tag that opens in a new tab
        return linkify.sub(
            lambda match: f'<a href="{match.group(1)}" target="_blank">{match.group(1)}</a>',
            self.body)
class PostTree:
    """
    Class to represent a tree of posts.

    Stores only the top level posts (TLPs); comments hang off their
    parents via Post.children, so every query walks the forest from tlps.
    """
    def __init__(self):
        # Top level posts (depth 0); each TLP roots its own comment subtree
        self.tlps: List[Post] = []
    def zig_zag(self) -> List[Post]:
        """
        Return all posts in zig zag order.
        TLPs are in reverse chronological order.
        Comments are ordered chronologically.
        """
        all_posts = []
        reverse_chrono_tlps = sorted(self.tlps,
                                     key=lambda post: post.created,
                                     reverse=True)
        for tlp in reverse_chrono_tlps:
            all_posts.append(tlp)
            # Post.descendants already yields the subtree oldest-first
            all_posts.extend(tlp.descendants)
        return all_posts
    def compressed_reverse_chrono_ancestry(
            self, requesting_user: User) -> List[Tuple[Post, bool, bool]]:
        """
        Returns a list of all posts with their ancestors.
        The post is attached to two boolean fields wrapped inside a tuple
        for use by the jinja template.
        First boolean indicates whether this post must be highlighted.
        Second boolean indicates whether a new TLP boundary has reached.
        Ancestry is not repeated when the subsequent post shares ancestors.
        This query is used in the activity view.
        """
        ret: List[Tuple[Post, bool, bool]] = []
        prev_ancestors: List[Post] = []
        prev_tlp: Optional[Post] = None
        for post in self.reverse_chrono():
            # skip posts this user is not allowed to see
            if not post.is_visible_to(requesting_user):
                continue
            if prev_tlp and post.tlp is not prev_tlp:
                # make tlp_switch true for the previous post
                # (tuples are immutable, so replace the last entry wholesale)
                ret[-1] = (ret[-1][0], ret[-1][1], True)
            # emit only the ancestors not already shown for the previous post
            ret.extend([(ancestor, False, False) for ancestor in post.ancestors
                        if ancestor not in prev_ancestors])
            # add current post with highlight true
            ret.append((post, True, False))
            prev_ancestors = post.ancestry
            prev_tlp = post.tlp
        return ret
    def all(self) -> List[Post]:
        """
        Return list of all posts in any order.
        Currently zig_zag order.
        """
        return self.zig_zag()
    def reverse_chrono(self) -> List[Post]:
        """
        Return all posts in reverse chronological order
        """
        return sorted(self.all(), key=lambda post: post.created, reverse=True)
    def find(self, post_id: int) -> Optional[Post]:
        """
        Find a post by post id; return None when no post matches.

        Raises:
            RuntimeError: if more than one post carries the same id.
        """
        post = [post for post in self.all() if post.post_id == post_id]
        if not post:
            return None
        if len(post) != 1:
            raise RuntimeError(
                "There should only have been one post with a given id")
        return post[0]
    def insert(self, post: Post):
        """
        Insert a post into the posttree.
        If the post does not have an id already assign the smallest available one.
        If post has a parent add the post as child of parent.
        Else add the post as a TLP.

        Raises:
            RuntimeError: if a post with the same id is already present.
        """
        if post.post_id is None:
            # next id = max existing id + 1 (the comprehension's `post` is
            # scoped to the comprehension, so the parameter is not clobbered)
            post.post_id = max(  #type:ignore
                [post.post_id for post in self.all()],
                default=0) + 1  #type: ignore
        if self.find(post.post_id) is not None:
            raise RuntimeError("Posttree already contains post with id")
        if post.parent is None:
            self.tlps.append(post)
        else:
            post.parent.children.append(post)
    def tlp_count(self, user: User) -> int:
        """
        Return #TLPs by the user
        """
        return len([post for post in self.tlps if post.author == user])
    def comment_count(self, user: User) -> int:
        """
        Return #comments by user (posts at depth > 0)
        """
        return len([
            post for post in self.all()
            if post.depth != 0 and post.author == user
        ])
    def upvote_count(self, user: User) -> int:
        """
        Return #upvotes by user
        """
        return len([post for post in self.all() if user in post.upvotes])
    def downvote_count(self, user: User) -> int:
        """
        Return #downvotes by user
        """
        return len([post for post in self.all() if user in post.downvotes])

View File

@@ -8,6 +8,7 @@ import json
# Internal Packages
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl
from src.processor.panchayat.panchayat_to_jsonl import panchayat_to_jsonl
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
from src.search_type import image_search, text_search
from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
@@ -70,6 +71,16 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()])
# Initialize Panchayat Search
if (t == SearchType.Panchayat or t == None) and config.content_type.panchayat:
# Extract Entries, Generate Yaml Embeddings
model.panchayat_search = text_search.setup(
panchayat_to_jsonl,
config.content_type.panchayat,
search_config=config.search_type.asymmetric,
regenerate=regenerate,
filters=[DateFilter(entry_key='compiled'), WordFilter(entry_key='compiled')])
# Initialize Ledger Search
if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
# Extract Entries, Generate Ledger Embeddings

View File

@@ -117,7 +117,7 @@
function populate_type_dropdown() {
// Populate type dropdown field with enabled search types only
var possible_search_types = ["org", "markdown", "ledger", "music", "image"];
var possible_search_types = ["org", "markdown", "ledger", "music", "image", "panchayat"];
fetch("/config/data")
.then(response => response.json())
.then(data => {

View File

@@ -12,6 +12,7 @@ from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_
from src.utils.constants import empty_escape_sequences
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
from src.utils.rawconfig import TextContentConfig
from typing import List
logger = logging.getLogger(__name__)
@@ -112,7 +113,7 @@ def extract_beancount_transactions(beancount_files):
return entries, dict(transaction_to_file_map)
def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]:
def convert_transactions_to_maps(entries: List[str], transaction_to_file_map) -> List[dict]:
"Convert each Beancount transaction into a dictionary"
entry_maps = []
for entry in entries:
@@ -123,6 +124,6 @@ def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) ->
return entry_maps
def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str:
def convert_transaction_maps_to_jsonl(entries: List[dict]) -> str:
"Convert each Beancount transaction dictionary to JSON and collate as JSONL"
return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])

View File

@@ -13,6 +13,7 @@ from src.utils.constants import empty_escape_sequences
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
from src.utils.rawconfig import TextContentConfig
from typing import List
logger = logging.getLogger(__name__)
@@ -111,7 +112,7 @@ def extract_markdown_entries(markdown_files):
return entries, dict(entry_to_file_map)
def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
def convert_markdown_entries_to_maps(entries: List[str], entry_to_file_map) -> List[dict]:
"Convert each Markdown entries into a dictionary"
entry_maps = []
for entry in entries:

View File

@@ -14,6 +14,8 @@ from src.utils.jsonl import dump_jsonl, compress_jsonl_data
from src.utils import state
from src.utils.rawconfig import TextContentConfig
from typing import List
logger = logging.getLogger(__name__)
@@ -105,7 +107,7 @@ def extract_org_entries(org_files):
return entries, dict(entry_to_file_map)
def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
def convert_org_nodes_to_entries(entries: List[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> List[dict]:
"Convert Org-Mode entries into list of dictionary"
entry_maps = []
for entry in entries:

View File

View File

@@ -0,0 +1,138 @@
#!/usr/bin/env python3
# Standard Packages
import json
import logging
import glob
import yaml
# Internal Packages
from panchayat import vdb
from src.utils.helpers import get_absolute_path, is_none_or_empty
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
from src.utils.rawconfig import TextContentConfig
logger = logging.getLogger(__name__)
def panchayat_constructor(loader, node):
    "PyYAML constructor that rebuilds a panchayat VDB object from a mapping node"
    mapping = loader.construct_mapping(node)
    return vdb.VDB(**mapping)
class VDBEntry():
    """Lightweight view of a single panchayat post used while building jsonl entries."""
    # Expected field types (class-level annotations, no defaults)
    post_id: str
    body: str
    title: str
    author: str

    def __init__(self, post_id, body, title, author):
        # plain attribute copies; no validation is performed
        self.author = author
        self.title = title
        self.body = body
        self.post_id = post_id
# Define Functions
def panchayat_to_jsonl(config: TextContentConfig, previous_entries = None):
    """
    Extract panchayat posts from the configured yaml files, write them out as
    (optionally gzip-compressed) jsonl, and return the indexed entries as
    a list of (index, entry) tuples.

    `previous_entries` is accepted for signature parity with the other
    *_to_jsonl processors but is currently unused.
    """
    # Input Validation
    if is_none_or_empty(config.input_files) and is_none_or_empty(config.input_filter):
        print("At least one of input-files or input-file-filter is required to be specified")
        # `exit()` is a site.py convenience not guaranteed to exist;
        # SystemExit(1) is the reliable equivalent
        raise SystemExit(1)

    # Get Panchayat yaml files to process (fixed copy-pasted "Markdown" wording)
    yaml_files = get_panchayat_files(config.input_files, config.input_filter)
    # NOTE(review): assumed to be a pathlib.Path (`.suffix` is used below) — confirm
    output_file = config.compressed_jsonl

    # Extract entries from the specified yaml files
    entries = extract_panchayat_entries(yaml_files)

    # Convert extracted entries to jsonl format
    jsonl_data = convert_panchayat_entries_to_jsonl(entries)

    # Compress the jsonl data when the target filename asks for it
    if output_file.suffix == ".gz":
        compress_jsonl_data(jsonl_data, output_file)
    elif output_file.suffix == ".jsonl":
        dump_jsonl(jsonl_data, output_file)

    return list(enumerate(entries))
def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
    "Get the Panchayat yaml files to process from explicit paths and/or glob filters"
    absolute_yaml_files, filtered_yaml_files = set(), set()
    if yaml_files:
        absolute_yaml_files = {get_absolute_path(yaml_file) for yaml_file in yaml_files}
    if yaml_file_filter:
        # each filter entry is a glob pattern; renamed loop variable to avoid
        # shadowing the builtin `filter`
        filtered_yaml_files = {
            filtered_file
            for file_filter in yaml_file_filter
            for filtered_file in glob.glob(get_absolute_path(file_filter))
        }

    all_yaml_files = sorted(absolute_yaml_files | filtered_yaml_files)

    # Warn about input files that do not look like yaml
    files_with_non_yaml_extensions = {
        yaml_file
        for yaml_file
        in all_yaml_files
        if not yaml_file.endswith(".yaml") and not yaml_file.endswith(".yml")
    }
    if files_with_non_yaml_extensions:
        # logger.warn is deprecated in favour of logger.warning;
        # message also fixed (it previously said "markdown-mode" files)
        logger.warning(f"[Warning] There maybe non yaml files in the input set: {files_with_non_yaml_extensions}")

    if verbose > 0:
        print(f'Processing files: {all_yaml_files}')

    return all_yaml_files
def extract_panchayat_entries(yaml_files):
    """
    Extract entries by post from specified Yaml files.

    Each entry is a dict with a 'compiled' text blob (body, author, title,
    created date and upvote count — used for embedding/search) and a 'raw'
    field holding only the post_id.
    """
    entries = []
    for yaml_file in yaml_files:
        # explicit encoding so parsing does not depend on the platform default
        with open(yaml_file, encoding="utf-8") as f:
            # SECURITY: UnsafeLoader can instantiate arbitrary python objects.
            # It is needed here to rebuild panchayat VDB objects, so the yaml
            # input files must come from a trusted source only.
            raw_data = yaml.load(f, Loader=yaml.UnsafeLoader)
            # de-duplicate within this file: zig_zag yields every post as part
            # of some TLP's subtree and descendants_and_i revisits them
            seen_ids = set()
            for post in raw_data.posts.zig_zag():
                all_subposts = post.descendants_and_i
                for subpost in all_subposts:
                    if subpost.post_id not in seen_ids:
                        seen_ids.add(subpost.post_id)
                        entry = {}
                        entry['compiled'] = f"""body: {subpost.body}
author: {subpost.author.username}
title: {subpost.title}
created: {subpost.created}
upvotes: {len(subpost.upvotes)}"""
                        entry['raw'] = subpost.post_id
                        entries.append(entry)
    return entries
def convert_panchayat_entries_to_jsonl(entries, verbose=0):
    """
    Convert each Panchayat Yaml entry to JSON and collate as JSONL.

    `verbose` is kept for signature compatibility with the other *_to_jsonl
    helpers but is currently unused.  (Removed the dead commented-out
    implementation that previously lived here.)
    """
    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])

View File

@@ -109,6 +109,17 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
results = text_search.collate_results(hits, entries, results_count)
collate_end = time.time()
if (t == SearchType.Panchayat or t == None) and state.model.panchayat_search:
# query Panchayat yaml files
query_start = time.time()
hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r)
query_end = time.time()
# collate and return results
collate_start = time.time()
results = text_search.collate_results(hits, entries, results_count)
collate_end = time.time()
if (t == SearchType.Image or t == None) and state.model.image_search:
# query images
query_start = time.time()

View File

@@ -1,6 +1,8 @@
# Standard Packages
from abc import ABC, abstractmethod
from typing import List, Set, Tuple
class BaseFilter(ABC):
@abstractmethod
@@ -12,5 +14,5 @@ class BaseFilter(ABC):
pass
@abstractmethod
def apply(self, query:str, raw_entries:list[str]) -> tuple[str, set[int]]:
def apply(self, query:str, raw_entries:List[str]) -> Tuple[str, Set[int]]:
pass

View File

@@ -14,6 +14,7 @@ from src.utils.config import TextSearchModel
from src.utils.rawconfig import TextSearchConfig, TextContentConfig
from src.utils.jsonl import load_jsonl
from typing import List
logger = logging.getLogger(__name__)
@@ -179,7 +180,7 @@ def collate_results(hits, entries, count=5):
in hits[0:count]]
def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel:
def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: List[BaseFilter] = []) -> TextSearchModel:
# Initialize Model
bi_encoder, cross_encoder, top_k = initialize_model(search_config)

View File

@@ -7,7 +7,6 @@ from importlib.metadata import version
from src.utils.helpers import resolve_absolute_path
from src.utils.yaml import parse_config_from_file
def cli(args=None):
# Setup Argument Parser for the Commandline Interface
parser = argparse.ArgumentParser(description="Start Khoj; A Natural Language Search Engine for your personal Notes, Transactions and Photos")

View File

@@ -7,12 +7,15 @@ from pathlib import Path
from src.utils.rawconfig import ConversationProcessorConfig
from src.search_filter.base_filter import BaseFilter
from typing import List
class SearchType(str, Enum):
Org = "org"
Ledger = "ledger"
Music = "music"
Markdown = "markdown"
Panchayat = "panchayat"
Image = "image"
@@ -21,7 +24,7 @@ class ProcessorType(str, Enum):
class TextSearchModel():
def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: list[BaseFilter], top_k):
def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: List[BaseFilter], top_k):
self.entries = entries
self.corpus_embeddings = corpus_embeddings
self.bi_encoder = bi_encoder
@@ -45,6 +48,7 @@ class SearchModels():
ledger_search: TextSearchModel = None
music_search: TextSearchModel = None
markdown_search: TextSearchModel = None
panchayat_search: TextSearchModel = None
image_search: ImageSearchModel = None

View File

@@ -44,6 +44,7 @@ class ContentConfig(ConfigBase):
ledger: Optional[TextContentConfig]
image: Optional[ImageContentConfig]
music: Optional[TextContentConfig]
panchayat: Optional[TextContentConfig]
markdown: Optional[TextContentConfig]
class TextSearchConfig(ConfigBase):

View File

@@ -10,6 +10,8 @@ from src.utils.config import SearchModels, ProcessorConfigModel
from src.utils.helpers import LRU
from src.utils.rawconfig import FullConfig
from typing import List
# Application Global State
config = FullConfig()
model = SearchModels()
@@ -18,7 +20,7 @@ config_file: Path = None
verbose: int = 0
host: str = None
port: int = None
cli_args: list[str] = None
cli_args: List[str] = None
query_cache = LRU()
if torch.cuda.is_available():