mirror of
https://github.com/khoj-ai/khoj.git
synced 2026-05-13 21:41:41 +00:00
Compare commits
14 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
21a9fbcea3 | ||
|
|
21eb58156c | ||
|
|
63f2312b84 | ||
|
|
f12ca56e93 | ||
|
|
ea62d47aa5 | ||
|
|
240901e07f | ||
|
|
13986f0e92 | ||
|
|
2e0ad6c8a1 | ||
|
|
365ab0c00e | ||
|
|
3b5f9814d8 | ||
|
|
da59ec2917 | ||
|
|
1e15d266da | ||
|
|
630abf2e17 | ||
|
|
84e3211a09 |
@@ -14,6 +14,9 @@ WORKDIR /app
|
||||
RUN pip install --upgrade pip && \
|
||||
pip install --upgrade .
|
||||
|
||||
# https://stackoverflow.com/questions/64776990/python-docker-no-module-found
|
||||
ENV PYTHONPATH /app
|
||||
|
||||
# Run the Application
|
||||
# There are more arguments required for the application to run,
|
||||
# but these should be passed in through the docker-compose.yml file.
|
||||
|
||||
@@ -2,39 +2,14 @@ content-type:
|
||||
# The /data/folder/ prefix to the folders is here because this is
|
||||
# the directory to which the local files are copied in the docker-compose.
|
||||
# If changing, the docker-compose volumes should also be changed to match.
|
||||
org:
|
||||
panchayat :
|
||||
input-files: null
|
||||
input-filter: "/data/org/*.org"
|
||||
compressed-jsonl: "/data/embeddings/notes.jsonl.gz"
|
||||
embeddings-file: "/data/embeddings/note_embeddings.pt"
|
||||
index_heading_entries: false
|
||||
|
||||
markdown:
|
||||
input-files: null
|
||||
input-filter: "/data/markdown/*.md"
|
||||
compressed-jsonl: "/data/embeddings/markdown.jsonl.gz"
|
||||
embeddings-file: "/data/embeddings/markdown_embeddings.pt"
|
||||
|
||||
ledger:
|
||||
input-files: null
|
||||
input-filter: /data/ledger/*.beancount
|
||||
compressed-jsonl: /data/embeddings/transactions.jsonl.gz
|
||||
embeddings-file: /data/embeddings/transaction_embeddings.pt
|
||||
|
||||
image:
|
||||
input-directories: ["/data/images/"]
|
||||
embeddings-file: "/data/embeddings/image_embeddings.pt"
|
||||
batch-size: 50
|
||||
use-xmp-metadata: false
|
||||
|
||||
music:
|
||||
input-files: ["/data/music/music.org"]
|
||||
input-filter: null
|
||||
compressed-jsonl: "/data/embeddings/songs.jsonl.gz"
|
||||
embeddings-file: "/data/embeddings/song_embeddings.pt"
|
||||
input-filter: "/data/panchayat/*.yaml"
|
||||
compressed-jsonl: "/data/embeddings/new/panchyat.jsonl.gz"
|
||||
embeddings-file: "/data/embeddings/new/panchayat_embeddings.pt"
|
||||
|
||||
search-type:
|
||||
symmetric:
|
||||
symmetric:
|
||||
encoder: "sentence-transformers/all-MiniLM-L6-v2"
|
||||
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||
model_directory: "/data/models/symmetric"
|
||||
@@ -44,10 +19,6 @@ search-type:
|
||||
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||
model_directory: "/data/models/asymmetric"
|
||||
|
||||
image:
|
||||
encoder: "sentence-transformers/clip-ViT-B-32"
|
||||
model_directory: "/data/models/image_encoder"
|
||||
|
||||
processor:
|
||||
#conversation:
|
||||
# openai-api-key: null
|
||||
|
||||
@@ -1,35 +1,10 @@
|
||||
content-type:
|
||||
org:
|
||||
input-files: # ["/path/to/org-file.org"] REQUIRED IF input-filter IS NOT SET OR
|
||||
input-filter: # /path/to/org/*.org REQUIRED IF input-files IS NOT SET
|
||||
compressed-jsonl: "~/.khoj/content/org/org.jsonl.gz"
|
||||
embeddings-file: "~/.khoj/content/org/org_embeddings.pt"
|
||||
index_heading_entries: false # Set to true to index entries with empty body
|
||||
|
||||
markdown:
|
||||
input-files: # ["/path/to/markdown-file.md"] REQUIRED IF input-filter IS NOT SET OR
|
||||
input-filter: # "/path/to/markdown/*.md" REQUIRED IF input-files IS NOT SET
|
||||
compressed-jsonl: "~/.khoj/content/markdown/markdown.jsonl.gz"
|
||||
embeddings-file: "~/.khoj/content/markdown/markdown_embeddings.pt"
|
||||
|
||||
ledger:
|
||||
input-files: # ["/path/to/ledger-file.beancount"] REQUIRED IF input-filter is not set OR
|
||||
input-filter: # /path/to/ledger/*.beancount REQUIRED IF input-files is not set
|
||||
compressed-jsonl: "~/.khoj/content/ledger/ledger.jsonl.gz"
|
||||
embeddings-file: "~/.khoj/content/ledger/ledger_embeddings.pt"
|
||||
|
||||
image:
|
||||
input-directories: # ["/path/to/images/"] REQUIRED IF input-filter IS NOT SET OR
|
||||
input-filter: # /path/to/images/*.jpg REQUIRED IF input-directories IS NOT SET
|
||||
embeddings-file: "~/.khoj/content/image/image_embeddings.pt"
|
||||
batch-size: 50
|
||||
use-xmp-metadata: false
|
||||
|
||||
music:
|
||||
input-files: # ["/path/to/music-file.org"] REQUIRED IF input-filter IS NOT SET OR
|
||||
input-filter: # /path/to/music/*.org REQUIRED IF input-files IS NOT SET
|
||||
compressed-jsonl: "~/.khoj/content/music/music.jsonl.gz"
|
||||
embeddings-file: "~/.khoj/content/music/music_embeddings.pt"
|
||||
panchayat:
|
||||
input-files: null
|
||||
input-filter: ["/home/saba/projects/panchayat/panchayat/instance/*.yaml"]
|
||||
compressed-jsonl: "./khoj/embeddings/panchyat.jsonl.gz"
|
||||
embeddings-file: "./khoj/embeddings/panchayat_embeddings.pt"
|
||||
|
||||
search-type:
|
||||
symmetric:
|
||||
@@ -47,6 +22,3 @@ search-type:
|
||||
model_directory: "~/.khoj/search/image/"
|
||||
|
||||
processor:
|
||||
conversation:
|
||||
openai-api-key: # "YOUR_OPENAI_API_KEY"
|
||||
conversation-logfile: "~/.khoj/processor/conversation/conversation_logs.json"
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
version: "3.9"
|
||||
services:
|
||||
server:
|
||||
image: ghcr.io/debanjum/khoj:latest
|
||||
build: .
|
||||
# image: ghcr.io/debanjum/khoj:latest
|
||||
ports:
|
||||
# If changing the local port (left hand side), no other changes required.
|
||||
# If changing the remote port (right hand side),
|
||||
@@ -21,9 +22,10 @@ services:
|
||||
- ./tests/data/ledger/:/data/ledger/
|
||||
- ./tests/data/music/:/data/music/
|
||||
- ./tests/data/markdown/:/data/markdown/
|
||||
- /home/saba/projects/panchayat/panchayat/instance/:/data/panchayat/
|
||||
# Embeddings and models are populated after the first run
|
||||
# You can set these volumes to point to empty directories on host
|
||||
- ./tests/data/embeddings/:/data/embeddings/
|
||||
- ./tests/data/models/:/data/models/
|
||||
# Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/
|
||||
command: --no-gui --host="0.0.0.0" --port=8000 -c=config/khoj_docker.yml -vv
|
||||
command: --no-gui -c=config/khoj_docker.yml --host="0.0.0.0" --port=8000 -vv
|
||||
|
||||
0
panchayat/__init__.py
Normal file
0
panchayat/__init__.py
Normal file
487
panchayat/vdb.py
Normal file
487
panchayat/vdb.py
Normal file
@@ -0,0 +1,487 @@
|
||||
"""
|
||||
Virtual Database that acts as an abstraction to the actual database.
|
||||
VDB is the python representation of the on disk database.
|
||||
VDB exposes methods to read/edit the database.
|
||||
VDB can be serialized/deserialized to on disk db.
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import re
|
||||
from enum import Enum, auto
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import yaml
|
||||
|
||||
# from libgravatar import Gravatar # type: ignore
|
||||
|
||||
|
||||
class Visibility(Enum):
    """Visibility level attached to a post.

    Ordered from most private to most public; code elsewhere relies on the
    auto-assigned integer values increasing in this order.
    """
    Aham = auto()  # private: only the author can see the post
    Gram = auto()  # village: any logged-in user can see the post
    Lok = auto()   # world: visible to everyone, no log-in required
|
||||
|
||||
|
||||
class VDB:
    """
    Python abstraction of panchayat DB
    """

    # pylint: disable=too-few-public-methods
    def __init__(self, outfile: str = None):
        # In-memory state mirroring the on-disk database.
        self.users = UserList()
        self.posts = PostTree()
        # Path the database is serialized to; may be None for read-only use.
        self.outfile = outfile

    def commit(self):
        """
        serialize the virtual database to disk overwriting existing file
        """
        if not self.outfile:
            raise RuntimeError("Outfile is empty")

        # Dump the whole object graph (users, posts, ...) as YAML.
        with open(self.outfile, 'w') as sink:
            yaml.dump(self, sink)

        # git commit
|
||||
|
||||
|
||||
class User:
    """
    Class to represent a user on panchayat
    """

    # pylint: disable=too-few-public-methods
    def __init__(self,
                 username: str,
                 password: str,
                 token: Optional[str] = None,
                 email: Optional[str] = None,
                 email_updates: bool = False):
        # pylint: disable=too-many-arguments
        self.username = username  # primary key
        self.password = password  # hash, never the plaintext
        self.token = token  # session/API token, if any
        self.email = email
        self.email_updates = email_updates  # opt-in for email notifications

    def __str__(self) -> str:
        return self.username

    def gravatar_url(self) -> str:
        """
        Return gravatar image url for the user.
        If user has email, then email is used to generate image.
        Else username is used to generate image.

        NOTE(review): currently returns the raw key because the libgravatar
        dependency is disabled; re-enable the commented code to get real URLs.
        """
        key = self.email if self.email else self.username
        return key
        # libgrav = Gravatar(key)
        # return libgrav.get_image(size=200, default="identicon", use_ssl=True)
|
||||
|
||||
|
||||
class UserList(list):
    """
    List of users
    """
    def find(self, username: str) -> Optional[User]:
        """
        Find user by username

        Returns None when no user matches; raises if usernames collide.
        """
        matches = [candidate for candidate in self
                   if candidate.username == username]
        if len(matches) > 1:
            raise RuntimeError("More than one user found for username")
        return matches[0] if matches else None
|
||||
|
||||
|
||||
class Post:  # pylint: disable=too-many-instance-attributes
    """
    Class to represent a post on panchayat.
    Inherited by LinkPost and TextPost
    """
    def __init__(
            self,
            author: User,
            title: str,
            body: str,
            visibility: Visibility = Visibility.Gram,
            upvotes=None,
            downvotes=None,
            created=None,
            parent: Optional["Post"] = None,
            post_id: Optional[int] = None,
    ):
        # pylint: disable=too-many-arguments
        self.post_id = post_id  # need id for permalink
        self.author = author
        # Default creation time to "now" when deserializing fresh posts
        self.created = created \
            if created is not None else datetime.datetime.now()
        self.title = title
        self.body = body
        # Votes are sets of User objects, so a user votes at most once
        self.upvotes = upvotes if upvotes else set()
        self.downvotes = downvotes if downvotes else set()
        self.children: List[Post] = []
        self.parent = parent
        self.depth: int = parent.depth + 1 if parent else 0

        # BUGFIX: look up Aham on the Visibility class, not on the passed-in
        # member (`visibility.Aham`) — member-to-member enum attribute access
        # was deprecated in Python 3.11 and removed in 3.12.
        if (self.parent and self.parent.visibility == Visibility.Aham
                and self.parent.author != self.author):
            raise RuntimeError("Cannot reply to someone else's aham post")
        self.visibility = visibility  # set visibility using setter

    @property
    def target_visibility(self):
        """
        Getter method for the *target* visibility, i.e. the level the author
        asked for, ignoring any cap imposed by ancestors.
        """
        return self._visibility

    @property
    def visibility(self):
        """
        Getter method for visibility

        Visibility can be lower than target if some ancestor has lower visibility.
        When the ancestor reaches the requested target visibility,
        self will automatically reach target visibility as well.
        """
        # Effective visibility is capped by the parent's effective visibility
        if self.parent and self.parent.visibility.value < self._visibility.value:
            return self.parent.visibility
        return self._visibility

    @visibility.setter
    def visibility(self, other: Visibility):
        """
        Setter method for visibility
        If self is being made aham then parent and all descendants must be by same author
        While setting visibility, all descendants are capped to self visibility level

        This setter sets _visibility property. This sets the target visibility.
        But, the actual visibility can stay lower if some ancestor has lower visibility.
        """
        if other == Visibility.Aham:
            if any([
                    descendant.author != self.author
                    for descendant in self.descendants
            ]):
                raise RuntimeError(
                    "Cannot make post Aham if there are children owned by others"
                )

        self._visibility = other

    def visibility_detail_string(self):
        """
        The detailed string for visibility
        "(Visibility.name requested)" if some descendant has a higher target visibility
        "(Visibility.name pending)" if some ancestor is preventing this post from target visibility
        """
        ret = ''
        if self.target_visibility != self.visibility:
            ret += f'({self.target_visibility.name} pending)'
        if self.children:
            # Highest visibility any descendant has asked for
            max_visibility_request = max([
                descendant.target_visibility for descendant in self.descendants
            ],
                                         key=lambda x: x.value)
            if max_visibility_request.value > self.target_visibility.value:
                ret += f'({max_visibility_request.name} requested)'
        return ret

    def is_visible_to(self, user: User = None) -> bool:
        """
        Returns True if self is visible to user, else False
        """
        if self.visibility == Visibility.Lok:
            return True
        if self.visibility == Visibility.Gram and user:
            return True
        if self.visibility == Visibility.Aham and self.author == user:
            return True
        return False

    @property
    def descendants(self) -> List["Post"]:
        """
        Return all my descendants with inorder traversal
        Does not include self
        """
        my_descendants = []
        # Children are visited oldest-first; each child's subtree follows it
        for child in sorted(self.children, key=lambda post: post.created):
            my_descendants.append(child)
            my_descendants.extend(child.descendants)
        return my_descendants

    @property
    def family(self) -> List["Post"]:
        """
        Return list of posts in family
        Two posts belong to same family if they share the same TLP
        """
        return self.tlp.descendants_and_i

    @property
    def descendants_and_i(self) -> List["Post"]:
        """
        Return all my descendants with inorder traversal
        Includes self
        """
        return [self] + self.descendants

    @property
    def ancestors(self) -> List["Post"]:
        """
        Return all my ancestors oldest first
        Does not include self
        """
        if self.parent:
            return self.parent.ancestors + [self.parent]
        return []

    @property
    def ancestry(self) -> List["Post"]:
        """
        Return all my ancestors including self
        """
        if self.parent:
            return self.parent.ancestry + [self]
        return [self]

    @property
    def tlp(self) -> "Post":
        """
        Return my top level post
        """
        if self.is_tlp():
            return self
        return self.parent.tlp  # type: ignore

    def is_tlp(self) -> bool:
        """
        Return True if I am a top level post
        """
        return self.depth == 0

    def is_leaf(self) -> bool:
        """
        Return True if I am a leaf post
        """
        return not self.children

    @property
    def vote_count(self) -> int:
        """
        Return the effective vote count of this post. upvote - downvote
        """
        return len(self.upvotes) - len(self.downvotes)

    def upvote_string(self) -> str:
        """
        Return string of all users who have upvoted this post
        """
        return ', '.join([user.username for user in self.upvotes])

    def downvote_string(self) -> str:
        """
        Return string of all users who have downvoted this post
        """
        return ', '.join([user.username for user in self.downvotes])

    def __str__(self) -> str:
        if self.title:
            return self.title
        return self.body

    def nullvote(self, user: User):
        """
        Remove user's vote from this post
        """
        self.upvotes.discard(user)
        self.downvotes.discard(user)

    def upvote(self, user: User):
        """
        Upvote this post. Upvote is done by voiding previous vote and creating new one.
        """
        self.nullvote(user)
        self.upvotes.add(user)

    def downvote(self, user: User):
        """
        Downvote this post. Downvote is done by voiding previous vote and creating new one.
        """
        self.nullvote(user)
        self.downvotes.add(user)

    def delete(self):
        """
        Delete this post. Does not remove the post from db,
        but only overwrites title and body with 'DELETED'.
        This is done to not break other posts that have reference to the deleted one.
        """
        self.title = "DELETED"
        self.body = "DELETED"

    def family_last_modified(self) -> datetime.datetime:
        """
        Return when the post family was last modified
        Max of created for all posts in family
        """
        return max([post.created for post in self.family])
|
||||
|
||||
|
||||
class LinkPost(Post):
    """
    Class to represent a link post on panchayat
    """

    def is_url(self) -> bool:  # pylint: disable=missing-function-docstring, no-self-use
        # A link post's body is always a URL.
        return True
|
||||
|
||||
|
||||
class TextPost(Post):
    """
    Class to represent a text post on panchayat
    """

    def is_url(self) -> bool:  # pylint: disable=missing-function-docstring, no-self-use
        # A text post's body is free-form text, never a bare URL.
        return False

    @property
    def html_body(self) -> str:
        """
        Return html string with all urls in body converted to hrefs
        Regex taken from https://urlregex.com/
        Trailing period and parenthesis was appended to remove false positives
        """
        # pylint: disable=line-too-long
        pattern = re.compile(
            r'''(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+[^\. \)])'''
        )
        anchored = pattern.sub(r'<a href="\1" target="_blank">\1</a>', self.body)
        return anchored
|
||||
|
||||
|
||||
class PostTree:
    """
    Class to represent a tree of posts

    Only top-level posts (TLPs) are stored directly; replies hang off each
    post's ``children`` list, so most queries traverse the forest.
    """
    def __init__(self):
        # Top-level posts; replies live in each post's .children
        self.tlps = []

    def zig_zag(self) -> List[Post]:
        """
        Return all posts in zig zag order.
        TLPs are in reverse chronological order.
        Comments are ordered chronologically.
        """
        all_posts = []
        reverse_chrono_tlps = sorted(self.tlps,
                                     key=lambda post: post.created,
                                     reverse=True)
        for tlp in reverse_chrono_tlps:
            all_posts.append(tlp)
            # .descendants is already chronological within each subtree
            all_posts.extend(tlp.descendants)
        return all_posts

    def compressed_reverse_chrono_ancestry(
            self, requesting_user: User) -> List[Tuple[Post, bool, bool]]:
        """
        Returns a list of all posts with their ancestors.
        The post is attached to two boolean fields wrapped inside a tuple
        for use by the jinja template.
        First boolean indicates whether this post must be highlighted.
        Second boolean indicates whether a new TLP boundary has reached.
        Ancestry is not repeated when the subsequent post shares ancestors.
        This query is used in the activity view.
        """
        ret: List[Tuple[Post, bool, bool]] = []
        prev_ancestors: List[Post] = []
        prev_tlp: Optional[Post] = None
        for post in self.reverse_chrono():
            # Visibility filtering happens here, not in the template
            if not post.is_visible_to(requesting_user):
                continue
            if prev_tlp and post.tlp is not prev_tlp:
                # make tlp_switch true for the previous post
                # (tuples are immutable, so rebuild the last entry)
                ret[-1] = (ret[-1][0], ret[-1][1], True)
            # Emit only the ancestors not already shown for the previous post
            ret.extend([(ancestor, False, False) for ancestor in post.ancestors
                        if ancestor not in prev_ancestors])
            # add current post with highlight true
            ret.append((post, True, False))
            prev_ancestors = post.ancestry
            prev_tlp = post.tlp
        return ret

    def all(self) -> List[Post]:
        """
        Return list of all posts in any order.
        Currently zig_zag order.
        """
        return self.zig_zag()

    def reverse_chrono(self) -> List[Post]:
        """
        Return all posts in reverse chronological order
        """
        return sorted(self.all(), key=lambda post: post.created, reverse=True)

    def find(self, post_id: int) -> Optional[Post]:
        """
        Find a post by post id

        Returns None when absent; raises on duplicate ids (data corruption).
        """
        post = [post for post in self.all() if post.post_id == post_id]
        if not post:
            return None
        if len(post) != 1:
            raise RuntimeError(
                "There should only have been one post with a given id")
        return post[0]

    def insert(self, post: Post):
        """
        Insert a post into the posttree.
        If the post does not have an id already assign the smallest available one.
        If post has a parent add the post as child of parent.
        Else add the post as a TLP.
        """
        if post.post_id is None:
            # Next id = max existing id + 1 (default=0 handles an empty tree)
            post.post_id = max(  # type: ignore
                [post.post_id for post in self.all()],
                default=0) + 1  # type: ignore

        if self.find(post.post_id) is not None:
            raise RuntimeError("Posttree already contains post with id")

        if post.parent is None:
            self.tlps.append(post)
        else:
            post.parent.children.append(post)

    def tlp_count(self, user: User) -> int:
        """
        Return #TLPs by the user
        """
        return len([post for post in self.tlps if post.author == user])

    def comment_count(self, user: User) -> int:
        """
        Return #comments by user

        A comment is any post below the top level (depth != 0).
        """
        return len([
            post for post in self.all()
            if post.depth != 0 and post.author == user
        ])

    def upvote_count(self, user: User) -> int:
        """
        Return #upvotes by user
        """
        return len([post for post in self.all() if user in post.upvotes])

    def downvote_count(self, user: User) -> int:
        """
        Return #downvotes by user
        """
        return len([post for post in self.all() if user in post.downvotes])
|
||||
|
||||
@@ -8,6 +8,7 @@ import json
|
||||
# Internal Packages
|
||||
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
|
||||
from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl
|
||||
from src.processor.panchayat.panchayat_to_jsonl import panchayat_to_jsonl
|
||||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||
from src.search_type import image_search, text_search
|
||||
from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
|
||||
@@ -70,6 +71,16 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
||||
regenerate=regenerate,
|
||||
filters=[DateFilter(), WordFilter(), FileFilter()])
|
||||
|
||||
# Initialize Panchayat Search
|
||||
if (t == SearchType.Panchayat or t == None) and config.content_type.panchayat:
|
||||
# Extract Entries, Generate Yaml Embeddings
|
||||
model.panchayat_search = text_search.setup(
|
||||
panchayat_to_jsonl,
|
||||
config.content_type.panchayat,
|
||||
search_config=config.search_type.asymmetric,
|
||||
regenerate=regenerate,
|
||||
filters=[DateFilter(entry_key='compiled'), WordFilter(entry_key='compiled')])
|
||||
|
||||
# Initialize Ledger Search
|
||||
if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
|
||||
# Extract Entries, Generate Ledger Embeddings
|
||||
|
||||
@@ -117,7 +117,7 @@
|
||||
|
||||
function populate_type_dropdown() {
|
||||
// Populate type dropdown field with enabled search types only
|
||||
var possible_search_types = ["org", "markdown", "ledger", "music", "image"];
|
||||
var possible_search_types = ["org", "markdown", "ledger", "music", "image", "panchayat"];
|
||||
fetch("/config/data")
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
|
||||
@@ -12,6 +12,7 @@ from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_
|
||||
from src.utils.constants import empty_escape_sequences
|
||||
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from src.utils.rawconfig import TextContentConfig
|
||||
from typing import List
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -112,7 +113,7 @@ def extract_beancount_transactions(beancount_files):
|
||||
return entries, dict(transaction_to_file_map)
|
||||
|
||||
|
||||
def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]:
|
||||
def convert_transactions_to_maps(entries: List[str], transaction_to_file_map) -> List[dict]:
|
||||
"Convert each Beancount transaction into a dictionary"
|
||||
entry_maps = []
|
||||
for entry in entries:
|
||||
@@ -123,6 +124,6 @@ def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) ->
|
||||
return entry_maps
|
||||
|
||||
|
||||
def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str:
|
||||
def convert_transaction_maps_to_jsonl(entries: List[dict]) -> str:
|
||||
"Convert each Beancount transaction dictionary to JSON and collate as JSONL"
|
||||
return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
|
||||
|
||||
@@ -13,6 +13,7 @@ from src.utils.constants import empty_escape_sequences
|
||||
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from src.utils.rawconfig import TextContentConfig
|
||||
|
||||
from typing import List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -111,7 +112,7 @@ def extract_markdown_entries(markdown_files):
|
||||
return entries, dict(entry_to_file_map)
|
||||
|
||||
|
||||
def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
|
||||
def convert_markdown_entries_to_maps(entries: List[str], entry_to_file_map) -> List[dict]:
|
||||
"Convert each Markdown entries into a dictionary"
|
||||
entry_maps = []
|
||||
for entry in entries:
|
||||
|
||||
@@ -14,6 +14,8 @@ from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from src.utils import state
|
||||
from src.utils.rawconfig import TextContentConfig
|
||||
|
||||
from typing import List
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -105,7 +107,7 @@ def extract_org_entries(org_files):
|
||||
return entries, dict(entry_to_file_map)
|
||||
|
||||
|
||||
def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
|
||||
def convert_org_nodes_to_entries(entries: List[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> List[dict]:
|
||||
"Convert Org-Mode entries into list of dictionary"
|
||||
entry_maps = []
|
||||
for entry in entries:
|
||||
|
||||
0
src/processor/panchayat/__init__.py
Normal file
0
src/processor/panchayat/__init__.py
Normal file
138
src/processor/panchayat/panchayat_to_jsonl.py
Normal file
138
src/processor/panchayat/panchayat_to_jsonl.py
Normal file
@@ -0,0 +1,138 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Standard Packages
|
||||
import json
|
||||
import logging
|
||||
import glob
|
||||
import yaml
|
||||
|
||||
# Internal Packages
|
||||
from panchayat import vdb
|
||||
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
||||
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from src.utils.rawconfig import TextContentConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def panchayat_constructor(loader, node):
    """Custom YAML constructor: build a vdb.VDB from a YAML mapping node."""
    mapping = loader.construct_mapping(node)
    return vdb.VDB(**mapping)
|
||||
|
||||
|
||||
class VDBEntry():
    """Lightweight record of a panchayat post, as used during indexing."""
    post_id: str
    body: str
    title: str
    author: str

    def __init__(self, post_id, body, title, author):
        # Plain attribute copies; no validation or conversion is performed.
        self.author = author
        self.title = title
        self.body = body
        self.post_id = post_id
|
||||
|
||||
|
||||
# Define Functions
|
||||
def panchayat_to_jsonl(config: TextContentConfig, previous_entries = None):
    """
    Extract panchayat posts from the configured YAML files, serialize them
    to (optionally gzip-compressed) JSONL, and return the indexed entries.

    Args:
        config: content config with input_files/input_filter and compressed_jsonl.
        previous_entries: accepted for interface parity with the other
            *_to_jsonl processors but currently unused.

    Returns:
        List of (index, entry) tuples for the extracted entries.
    """
    # Input Validation
    if is_none_or_empty(config.input_files) and is_none_or_empty(config.input_filter):
        print("At least one of input-files or input-file-filter is required to be specified")
        exit(1)

    # Get Panchayat YAML files to process
    # (earlier comments here said "Markdown" — copy-paste from markdown_to_jsonl)
    yaml_files = get_panchayat_files(config.input_files, config.input_filter)

    output_file = config.compressed_jsonl

    # Extract entries from the specified Panchayat YAML files
    entries = extract_panchayat_entries(yaml_files)

    # Convert each entry to a JSONL line
    jsonl_data = convert_panchayat_entries_to_jsonl(entries)

    # Write JSONL data; ".gz" gets gzip-compressed, bare ".jsonl" is written as-is
    # NOTE(review): assumes output_file is a pathlib.Path (has .suffix) — confirm
    if output_file.suffix == ".gz":
        compress_jsonl_data(jsonl_data, output_file)
    elif output_file.suffix == ".jsonl":
        dump_jsonl(jsonl_data, output_file)

    return list(enumerate(entries))
|
||||
|
||||
|
||||
def get_panchayat_files(yaml_files=None, yaml_file_filter=None, verbose=0):
    """
    Get the Panchayat files to process.

    Combines explicitly listed files with files matched by glob filters,
    warns about entries that do not look like YAML, and returns the sorted
    union of absolute paths.
    """
    absolute_yaml_files, filtered_yaml_files = set(), set()
    if yaml_files:
        absolute_yaml_files = {get_absolute_path(yaml_file) for yaml_file in yaml_files}
    if yaml_file_filter:
        # `file_filter` renamed from `filter`, which shadowed the builtin
        filtered_yaml_files = {
            filtered_file
            for file_filter in yaml_file_filter
            for filtered_file in glob.glob(get_absolute_path(file_filter))
        }

    all_yaml_files = sorted(absolute_yaml_files | filtered_yaml_files)

    # Warn (but do not fail) about files without a YAML extension
    files_with_non_yaml_extensions = {
        yaml_file
        for yaml_file
        in all_yaml_files
        if not yaml_file.endswith(".yaml") and not yaml_file.endswith(".yml")
    }

    if any(files_with_non_yaml_extensions):
        # logger.warn is a deprecated alias of logger.warning; the old message
        # also said "markdown-mode" (copy-paste from the markdown processor)
        logger.warning(f"[Warning] There may be non yaml files in the input set: {files_with_non_yaml_extensions}")

    if verbose > 0:
        print(f'Processing files: {all_yaml_files}')

    return all_yaml_files
|
||||
|
||||
|
||||
def extract_panchayat_entries(yaml_files):
    "Extract entries by post from specified Yaml files"

    entries = []
    for yaml_file in yaml_files:
        with open(yaml_file) as f:

            # SECURITY NOTE(review): UnsafeLoader can instantiate arbitrary
            # python objects from the YAML stream — acceptable only because
            # the panchayat DB files are locally generated and trusted;
            # never point this at untrusted input.
            raw_data = yaml.load(f, Loader=yaml.UnsafeLoader)

            # Posts can appear multiple times during traversal (a reply is a
            # descendant of every ancestor), so dedupe on post_id.
            seen_ids = set()

            for post in raw_data.posts.zig_zag():
                all_subposts = post.descendants_and_i
                for subpost in all_subposts:
                    if subpost.post_id not in seen_ids:
                        seen_ids.add(subpost.post_id)
                        entry = dict()

                        # 'compiled' is the searchable text blob fed to the
                        # embedding model; continuation lines are part of the
                        # string, so they stay at column 0.
                        entry['compiled'] = f"""body: {subpost.body}
author: {subpost.author.username}
title: {subpost.title}
created: {subpost.created}
upvotes: {len(subpost.upvotes)}"""

                        # 'raw' keeps the post id for linking back to the post
                        entry['raw'] = subpost.post_id
                        entries.append(entry)

    return entries
|
||||
|
||||
|
||||
def convert_panchayat_entries_to_jsonl(entries, verbose=0):
    """
    Convert each Panchayat Yaml entry to JSON and collate as JSONL.

    Args:
        entries: iterable of JSON-serializable dicts.
        verbose: accepted for interface compatibility; currently unused.

    Returns:
        One JSON object per line, each terminated by a newline.
    """
    # ensure_ascii=False keeps non-ASCII (e.g. Devanagari) text readable.
    # (A commented-out earlier implementation was removed as dead code.)
    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
|
||||
|
||||
@@ -109,6 +109,17 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
|
||||
results = text_search.collate_results(hits, entries, results_count)
|
||||
collate_end = time.time()
|
||||
|
||||
if (t == SearchType.Panchayat or t == None) and state.model.panchayat_search:
|
||||
# query Panchayat yaml files
|
||||
query_start = time.time()
|
||||
hits, entries = text_search.query(user_query, state.model.panchayat_search, rank_results=r)
|
||||
query_end = time.time()
|
||||
|
||||
# collate and return results
|
||||
collate_start = time.time()
|
||||
results = text_search.collate_results(hits, entries, results_count)
|
||||
collate_end = time.time()
|
||||
|
||||
if (t == SearchType.Image or t == None) and state.model.image_search:
|
||||
# query images
|
||||
query_start = time.time()
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
# Standard Packages
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from typing import List, Set, Tuple
|
||||
|
||||
|
||||
class BaseFilter(ABC):
|
||||
@abstractmethod
|
||||
@@ -12,5 +14,5 @@ class BaseFilter(ABC):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def apply(self, query:str, raw_entries:list[str]) -> tuple[str, set[int]]:
|
||||
def apply(self, query:str, raw_entries:List[str]) -> Tuple[str, Set[int]]:
|
||||
pass
|
||||
@@ -14,6 +14,7 @@ from src.utils.config import TextSearchModel
|
||||
from src.utils.rawconfig import TextSearchConfig, TextContentConfig
|
||||
from src.utils.jsonl import load_jsonl
|
||||
|
||||
from typing import List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -179,7 +180,7 @@ def collate_results(hits, entries, count=5):
|
||||
in hits[0:count]]
|
||||
|
||||
|
||||
def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel:
|
||||
def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: List[BaseFilter] = []) -> TextSearchModel:
|
||||
# Initialize Model
|
||||
bi_encoder, cross_encoder, top_k = initialize_model(search_config)
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@ from importlib.metadata import version
|
||||
from src.utils.helpers import resolve_absolute_path
|
||||
from src.utils.yaml import parse_config_from_file
|
||||
|
||||
|
||||
def cli(args=None):
|
||||
# Setup Argument Parser for the Commandline Interface
|
||||
parser = argparse.ArgumentParser(description="Start Khoj; A Natural Language Search Engine for your personal Notes, Transactions and Photos")
|
||||
|
||||
@@ -7,12 +7,15 @@ from pathlib import Path
|
||||
from src.utils.rawconfig import ConversationProcessorConfig
|
||||
from src.search_filter.base_filter import BaseFilter
|
||||
|
||||
from typing import List
|
||||
|
||||
|
||||
class SearchType(str, Enum):
|
||||
Org = "org"
|
||||
Ledger = "ledger"
|
||||
Music = "music"
|
||||
Markdown = "markdown"
|
||||
Panchayat = "panchayat"
|
||||
Image = "image"
|
||||
|
||||
|
||||
@@ -21,7 +24,7 @@ class ProcessorType(str, Enum):
|
||||
|
||||
|
||||
class TextSearchModel():
|
||||
def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: list[BaseFilter], top_k):
|
||||
def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, filters: List[BaseFilter], top_k):
|
||||
self.entries = entries
|
||||
self.corpus_embeddings = corpus_embeddings
|
||||
self.bi_encoder = bi_encoder
|
||||
@@ -45,6 +48,7 @@ class SearchModels():
|
||||
ledger_search: TextSearchModel = None
|
||||
music_search: TextSearchModel = None
|
||||
markdown_search: TextSearchModel = None
|
||||
panchayat_search: TextSearchModel = None
|
||||
image_search: ImageSearchModel = None
|
||||
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@ class ContentConfig(ConfigBase):
|
||||
ledger: Optional[TextContentConfig]
|
||||
image: Optional[ImageContentConfig]
|
||||
music: Optional[TextContentConfig]
|
||||
panchayat: Optional[TextContentConfig]
|
||||
markdown: Optional[TextContentConfig]
|
||||
|
||||
class TextSearchConfig(ConfigBase):
|
||||
|
||||
@@ -10,6 +10,8 @@ from src.utils.config import SearchModels, ProcessorConfigModel
|
||||
from src.utils.helpers import LRU
|
||||
from src.utils.rawconfig import FullConfig
|
||||
|
||||
from typing import List
|
||||
|
||||
# Application Global State
|
||||
config = FullConfig()
|
||||
model = SearchModels()
|
||||
@@ -18,7 +20,7 @@ config_file: Path = None
|
||||
verbose: int = 0
|
||||
host: str = None
|
||||
port: int = None
|
||||
cli_args: list[str] = None
|
||||
cli_args: List[str] = None
|
||||
query_cache = LRU()
|
||||
|
||||
if torch.cuda.is_available():
|
||||
|
||||
Reference in New Issue
Block a user