Mirror of https://github.com/khoj-ai/khoj.git (synced 2026-05-13 21:41:41 +00:00)

Compare commits: 48 commits
| SHA1 |
|---|
| 3e59be7f1d |
| d078e7b1f6 |
| 4d910936b7 |
| 5c7d7f558d |
| 5f2be2a9bb |
| 3a1c5a6dab |
| 429e1b4b48 |
| 83e1088d42 |
| 71e8ddd9a2 |
| d00c5da8b7 |
| 3e3a1ecbc8 |
| ef6a0044f4 |
| c73feebf25 |
| ad41ef3991 |
| 1482fd4d4d |
| b02323ade6 |
| 89c7819cb7 |
| 6a0297cc86 |
| 7669b85da6 |
| 6e70b914c2 |
| 9bcca43299 |
| 1673bb5558 |
| 88d1a29a84 |
| da98b92dd4 |
| 7ad96036b0 |
| 58d86d7876 |
| a15711e635 |
| e590d75b20 |
| 49ab201c30 |
| ba47f2ab39 |
| 874cffd256 |
| 52f68167ce |
| f08e9539f1 |
| 37f7f9fd1d |
| b9fb656657 |
| 86e2bec9a0 |
| c2249eadb2 |
| b2718d330c |
| 31e933207f |
| 9c76150895 |
| 83ed8561ee |
| 88c42b3043 |
| 6308388dfc |
| 802472cd99 |
| f664a74e77 |
| bfd516c1a4 |
| 58c2c3b71a |
| effb52f859 |
.github/workflows/build_desktop.yml (vendored, new file, 108 lines)

@@ -0,0 +1,108 @@
name: desktop_dev_build

on:
  push:
    branches:
      - master
  workflow_dispatch:

jobs:
  publish_desktop_apps:
    name: 🖥️ Publish Desktop Apps

    strategy:
      matrix:
        include:
          - os: ubuntu-latest
            extension: deb
          - os: macos-latest
            extension: dmg
          - os: windows-latest
            extension: exe

    runs-on: ${{ matrix.os }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'

      - name: ⏬️ Install Dependencies
        shell: bash
        run: |
          if [ "$RUNNER_OS" == "Linux" ]; then
            sudo apt install libegl1 libxcb-xinerama0 python3-tk -y
          fi
          python -m pip install --upgrade pip
          pip install pyinstaller

      - name: ⬇️ Install Khoj App
        run: |
          pip install --upgrade .

      - name: 📦 Package Khoj App
        shell: bash
        run: |
          # Setup Environment for Reproducible Builds
          export PYTHONHASHSEED=42
          export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct)

          pyinstaller --noconfirm Khoj.spec
          if [ "$RUNNER_OS" == "Windows" ]; then
            mv dist/Khoj.exe dist/khoj_dev_amd64.exe
          fi

      - name: 💻 Create Mac App DMG
        if: matrix.os == 'macos-latest'
        run: |
          # Install Mac DMG Creator
          brew install create-dmg
          # Copy app to separate dmg folder
          mkdir -p dist/dmg && cp -r dist/Khoj.app dist/dmg
          # Create disk image with the app
          create-dmg \
            --volname "Khoj" \
            --volicon "src/khoj/interface/web/assets/icons/favicon.icns" \
            --window-pos 200 120 \
            --window-size 600 300 \
            --icon-size 100 \
            --icon "Khoj.app" 175 120 \
            --hide-extension "Khoj.app" \
            --app-drop-link 425 120 \
            "dist/khoj_dev_amd64.dmg" \
            "dist/dmg/"

      - uses: ruby/setup-ruby@v1
        if: matrix.os == 'ubuntu-latest'
        with:
          ruby-version: '3.0'

      - name: 🐧 Create Debian Package
        if: matrix.os == 'ubuntu-latest'
        shell: bash
        run: |
          # Install Debian Packager
          gem install fpm

          # Copy app files into expected output directory structure
          mkdir -p package/opt package/usr/share/applications package/usr/share/icons/hicolor/128x128/apps
          cp -r dist/Khoj package/opt/Khoj
          cp src/khoj/interface/web/assets/icons/favicon-128x128.png package/usr/share/icons/hicolor/128x128/apps/Khoj.png
          cp Khoj.desktop package/usr/share/applications

          # Fix permissions to be usable by non-root users
          find package/usr/share -type f -exec chmod 644 -- {} +
          chmod 755 package/opt/Khoj

          # Package the app
          fpm -C package -s dir -t deb -n Khoj -p dist/khoj_dev_amd64.deb

      - uses: actions/upload-artifact@v3
        with:
          name: khoj_dev_amd64.${{matrix.extension}}
          path: dist/khoj_dev_amd64.${{matrix.extension}}
          retention-days: 1
.github/workflows/dockerize.yml (vendored, 2 changes)

@@ -44,4 +44,4 @@ jobs:
push: true
tags: ghcr.io/${{ github.repository }}:${{ env.DOCKER_IMAGE_TAG }}
build-args: |
PORT=8000
PORT=42110
@@ -4,11 +4,12 @@ LABEL org.opencontainers.image.source https://github.com/khoj-ai/khoj

# Install System Dependencies
RUN apt update -y && \
apt -y install python3-pip python3-pyqt6 git
apt -y install python3-pip git

# Install Python Dependencies
RUN pip install --upgrade pip && \
pip install git+https://github.com/khoj-ai/khoj.git
# Install Application
COPY . .
RUN sed -i 's/dynamic = \["version"\]/version = "0.0.0"/' pyproject.toml && \
pip install --no-cache-dir .

# Run the Application
# There are more arguments required for the application to run,
README.md (70 changes)

@@ -47,10 +47,10 @@
- [Miscellaneous](#Miscellaneous-1)
- [Development](#Development)
- [Visualize Codebase](#visualize-codebase)
- [Create Release](#create-khoj-release)
- [Setup](#Setup)
- [Using Pip](#Using-Pip)
- [Using Docker](#Using-Docker)
- [Using Conda](#Using-Conda)
- [Validate](#Validate)
- [Credits](#Credits)

@@ -169,9 +169,9 @@ The optional steps below allow using Khoj from within an existing application li
- **Khoj via Emacs**
- Run `M-x khoj <user-query>`
- **Khoj via Web**
- Open <http://localhost:8000/> directly
- Open <http://localhost:42110/> directly
- **Khoj via API**
- See the Khoj FastAPI [Swagger Docs](http://localhost:8000/docs), [ReDocs](http://localhost:8000/redocs)
- See the Khoj FastAPI [Swagger Docs](http://localhost:42110/docs), [ReDocs](http://localhost:42110/redocs)

<details><summary>Query Filters</summary>

@@ -207,7 +207,7 @@ Use structured query syntax to filter the natural language search results
- [Setup your OpenAI API key in Khoj](#set-your-openai-api-key-in-khoj)

#### Use
1. Open [/chat](http://localhost:8000/chat)[^2]
1. Open [/chat](http://localhost:42110/chat)[^2]
2. Type your queries and see response by Khoj from your notes

#### Demo

@@ -256,7 +256,7 @@ pip install --upgrade --pre khoj-assistant
- **Refer**: [Issue with Fix](https://github.com/khoj-ai/khoj/issues/82#issuecomment-1241890946) for more details

#### Search starts giving wonky results
- **Fix**: Open [/api/update?force=true](http://localhost:8000/api/update?force=true)[^2] in browser to regenerate index from scratch
- **Fix**: Open [/api/update?force=true](http://localhost:42110/api/update?force=true)[^2] in browser to regenerate index from scratch
- **Note**: *This is a fix for when you percieve the search results have degraded. Not if you think they've always given wonky results*

#### Khoj in Docker errors out with \"Killed\" in error message

@@ -270,7 +270,7 @@ pip install --upgrade --pre khoj-assistant
### Access Khoj on Mobile
1. [Setup Khoj](#Setup) on your personal server. This can be any always-on machine, i.e an old computer, RaspberryPi(?) etc
2. [Install](https://tailscale.com/kb/installation/) [Tailscale](tailscale.com/) on your personal server and phone
3. Open the Khoj web interface of the server from your phone browser.<br /> It should be `http://tailscale-ip-of-server:8000` or `http://name-of-server:8000` if you've setup [MagicDNS](https://tailscale.com/kb/1081/magicdns/)
3. Open the Khoj web interface of the server from your phone browser.<br /> It should be `http://tailscale-ip-of-server:42110` or `http://name-of-server:42110` if you've setup [MagicDNS](https://tailscale.com/kb/1081/magicdns/)
4. Click the [Add to Homescreen](https://developer.mozilla.org/en-US/docs/Web/Progressive_web_apps/Add_to_home_screen) button
5. Enjoy exploring your notes, documents and images from your phone!

@@ -311,7 +311,7 @@ pip install --upgrade --pre khoj-assistant
model_directory: "~/.khoj/search/asymmetric/"
```

2. Regenerate your content index. For example, by opening [\<khoj-url\>/api/update?t=force](http://localhost:8000/api/update?t=force)
2. Regenerate your content index. For example, by opening [\<khoj-url\>/api/update?force=true](http://localhost:42110/api/update?force=true)

### Bootstrap Khoj Search for Offline Usage later

@@ -330,7 +330,7 @@ pip install --upgrade --pre khoj-assistant
### Set your OpenAI API key in Khoj
If you want, Khoj can be configured to use OpenAI for search and chat.<br />
Add your OpenAI API to Khoj by using either of the two options below:
- Open your [Khoj settings](http://localhost:8000/config/processor/conversation), add your OpenAI API key, and click *Save*. Then go to your [Khoj settings](http://localhost:8000/config) and click `Configure`. This will refresh Khoj with your OpenAI API key.
- Open your [Khoj settings](http://localhost:42110/config/processor/conversation), add your OpenAI API key, and click *Save*. Then go to your [Khoj settings](http://localhost:42110/config) and click `Configure`. This will refresh Khoj with your OpenAI API key.
- Set `openai-api-key` field under `processor.conversation` section in your `khoj.yml`[^1] to your [OpenAI API key](https://beta.openai.com/account/api-keys) and restart khoj:
```diff
processor:

@@ -344,11 +344,11 @@ Add your OpenAI API to Khoj by using either of the two options below:
**Warning**: *This will enable Khoj to send your query and note(s) to OpenAI for processing*

### GPT API
- The [chat](http://localhost:8000/api/chat), [answer](http://localhost:8000/api/beta/answer) and [search](http://localhost:8000/api/beta/search) API endpoints use [OpenAI API](https://openai.com/api/)
- The [chat](http://localhost:42110/api/chat), [answer](http://localhost:42110/api/beta/answer) and [search](http://localhost:42110/api/beta/search) API endpoints use [OpenAI API](https://openai.com/api/)
- They are disabled by default
- To use them:
1. [Setup your OpenAI API key in Khoj](#set-your-openai-api-key-in-khoj)
2. Interact with them from the [Khoj Swagger docs](http://locahost:8000/docs)[^2]
2. Interact with them from the [Khoj Swagger docs](http://locahost:42110/docs)[^2]

### Index Github Repository for Search, Chat
The Khoj Github plugin can index issues, commit messages and markdown, org-mode and PDF files from any repositories you have access to. This allows you to chat or search with these repositories. Get answers, resolve issues or just explore a repo with the help of your AI personal assistant.

@@ -388,6 +388,19 @@ Note: *Khoj will ignore code files in the repository for now as the default AI m



### Create Khoj Release
Follow the steps below to [release](https://github.com/debanjum/khoj/releases/) Khoj. This will create a stable release of Khoj on [Pypi](https://pypi.org/project/khoj-assistant/), [Melpa](https://stable.melpa.org/#%252Fkhoj) and [Obsidian](https://obsidian.md/plugins?id%253Dkhoj). It will also create desktop apps of Khoj and attach them to the latest release.

1. Create and tag release commit by running the bump_version script. The release commit sets version number in required metadata files.
```shell
./scripts/bump_version.sh -c "<release_version>"
```
2. Push commit and then the tag to trigger the release workflow to create Release with auto generated release notes.
```shell
git push origin master # push release commit to khoj repository
git push origin <release_version> # push release tag to khoj repository
```
3. [Optional] Update the Release Notes to highlight new features, fixes and updates
### Setup
#### Using Pip
##### 1. Install

@@ -409,7 +422,7 @@ pip install -e .[dev]
khoj -vv
```
2. Configure Khoj
- **Via the Settings UI**: Add files, directories to index the [Khoj settings](http://localhost:8000/config) UI once Khoj has started up. Once you've saved all your settings, click `Configure`.
- **Via the Settings UI**: Add files, directories to index the [Khoj settings](http://localhost:42110/config) UI once Khoj has started up. Once you've saved all your settings, click `Configure`.
- **Manually**:
- Copy the `config/khoj_sample.yml` to `~/.khoj/khoj.yml`
- Set `input-files` or `input-filter` in each relevant `content-type` section of `~/.khoj/khoj.yml`

@@ -445,39 +458,6 @@ docker-compose up -d
docker-compose build --pull
```

#### Using Conda
##### 1. Install Dependencies
- [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html)

##### 2. Install Khoj
```shell
git clone https://github.com/khoj-ai/khoj && cd khoj
conda env create -f config/environment.yml
conda activate khoj
python3 -m pip install pyqt6 # As conda does not support pyqt6 yet
```

##### 3. Configure
- Copy the `config/khoj_sample.yml` to `~/.khoj/khoj.yml`
- Set `input-files` or `input-filter` in each relevant `content-type` section of `~/.khoj/khoj.yml`
- Set `input-directories` field in `image` `content-type` section
- Delete `content-type`, `processor` sub-sections irrelevant for your use-case

##### 4. Run
```shell
python3 -m src.khoj.main -vv
```
Load ML model, generate embeddings and expose API to query notes, images, documents etc specified in config YAML

##### 5. Upgrade
```shell
cd khoj
git pull origin master
conda deactivate khoj
conda env update -f config/environment.yml
conda activate khoj
```

### Validate
#### Before Make Changes
1. Install Git Hooks for Validation

@@ -522,4 +502,4 @@ conda activate khoj
[^1]: Default Khoj config file @ `~/.khoj/khoj.yml`

[^2]: Default Khoj url @ http://localhost:8000
[^2]: Default Khoj url @ http://localhost:42110
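The README changes above repeatedly move the default server port from 8000 to 42110 and point users at the `/api/update?force=true` endpoint to rebuild the search index. Below is a minimal sketch of triggering that same re-index from a script rather than the browser; the helper name, the `requests` dependency, and the `client` label are assumptions (the Emacs and Obsidian clients send a similar `client` parameter), not part of this change.

```python
# Hypothetical helper: ask a local Khoj server to regenerate its content index,
# equivalent to opening /api/update?force=true in the browser as the README suggests.
import requests

KHOJ_URL = "http://localhost:42110"  # new default Khoj url introduced in this change

response = requests.get(
    f"{KHOJ_URL}/api/update",
    params={"force": "true", "client": "script"},  # "client" label is illustrative, assumed optional
    timeout=600,  # regenerating embeddings can take a while on large note collections
)
response.raise_for_status()
print("Re-index triggered:", response.text[:200])
```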
@@ -1,22 +0,0 @@
name: khoj
channels:
  - conda-forge
dependencies:
  - python=3.8.*
  - numpy=1.22.4
  - pytorch=1.13.1
  - torchvision=0.14.1
  - transformers=4.21.0
  - sentence-transformers=2.1.0
  - fastapi=0.77.1
  - uvicorn=0.17.6
  - pyyaml=6.0
  - pytest=7.1.2
  - pillow=9.3.0
  - openai=0.20.0
  - pydantic=1.9.1
  - jinja2=3.1.2
  - aiofiles=0.8.0
  - huggingface_hub=0.8.1
  - dateparser=1.1.1
  - schedule=1.1.0
@@ -1,116 +0,0 @@
name: khoj
channels:
  - conda-forge
dependencies:
  - aiofiles=0.8.0=pyhd8ed1ab_0
  - asgiref=3.4.1=pyhd8ed1ab_0
  - attrs=21.2.0=pyhd8ed1ab_0
  - brotlipy=0.7.0=py39h5161555_1001
  - ca-certificates=2022.6.15=h4653dfc_0
  - certifi=2022.6.15=py39h2804cbe_0
  - cffi=1.14.6=py39hda8b47f_0
  - chardet=4.0.0=py39h2804cbe_1
  - charset-normalizer=2.0.0=pyhd8ed1ab_0
  - click=8.0.1=py39h2804cbe_0
  - colorama=0.4.4=pyh9f0ad1d_0
  - cryptography=3.4.7=py39h73257c9_0
  - dataclasses=0.8=pyhc8e2a94_3
  - dateparser=1.1.1=pyhd8ed1ab_0
  - et_xmlfile=1.0.1=py_1001
  - fastapi=0.68.2=pyhd8ed1ab_0
  - filelock=3.0.12=pyh9f0ad1d_0
  - freetype=2.10.4=h17b34a0_1
  - future=0.18.2=py39h2804cbe_3
  - h11=0.12.0=pyhd8ed1ab_0
  - huggingface_hub=0.2.1=pyhd8ed1ab_0
  - idna=3.1=pyhd3deb0d_0
  - importlib-metadata=4.6.4=py39h2804cbe_0
  - importlib_metadata=4.6.4=hd8ed1ab_0
  - iniconfig=1.1.1=pyh9f0ad1d_0
  - jbig=2.1=h3422bc3_2003
  - jinja2=3.0.3=pyhd8ed1ab_0
  - joblib=1.0.1=pyhd8ed1ab_0
  - jpeg=9d=h27ca646_0
  - lcms2=2.12=had6a04f_0
  - lerc=2.2.1=h9f76cd9_0
  - libblas=3.9.0=11_osxarm64_openblas
  - libcblas=3.9.0=11_osxarm64_openblas
  - libcxx=12.0.1=h168391b_0
  - libdeflate=1.7=h27ca646_5
  - libffi=3.3=h9f76cd9_2
  - libgfortran=5.0.0.dev0=11_0_1_hf114ba7_23
  - libgfortran5=11.0.1.dev0=hf114ba7_23
  - liblapack=3.9.0=11_osxarm64_openblas
  - libopenblas=0.3.17=openmp_h5dd58f0_1
  - libpng=1.6.37=hf7e6567_2
  - libprotobuf=3.16.0=hccf11d3_0
  - libtiff=4.3.0=hc6122e1_1
  - libwebp-base=1.2.1=h3422bc3_0
  - llvm-openmp=12.0.1=hf3c4609_1
  - lz4-c=1.9.3=hbdafb3b_1
  - markupsafe=2.0.1=py39h5161555_1
  - more-itertools=8.8.0=pyhd8ed1ab_0
  - ncurses=6.2=h9aa5885_4
  - ninja=1.10.2=h4d860bb_0
  - nltk=3.6.2=pyhd8ed1ab_0
  - numpy=1.21.4=py39h1f3b974_0
  - olefile=0.46=pyh9f0ad1d_1
  - openai=0.11.4=py39h2804cbe_0
  - openjpeg=2.4.0=h062765e_1
  - openpyxl=3.0.9=pyhd8ed1ab_0
  - openssl=1.1.1q=ha287fd2_0
  - packaging=21.0=pyhd8ed1ab_0
  - pandas=1.3.4=py39h7f752ed_1
  - pandas-stubs=1.2.0.38=py39h2804cbe_0
  - pillow=8.3.2=py39ha74c66e_0
  - pip=21.2.4=pyhd8ed1ab_0
  - pluggy=0.13.1=py39h2804cbe_4
  - py=1.10.0=pyhd3deb0d_0
  - pycparser=2.20=pyh9f0ad1d_2
  - pydantic=1.8.2=py39h5161555_2
  - pyopenssl=20.0.1=pyhd8ed1ab_0
  - pyparsing=2.4.7=pyh9f0ad1d_0
  - pysocks=1.7.1=py39h2804cbe_3
  - pytest=6.2.5=py39h2804cbe_1
  - python=3.9.7=h54d631c_3_cpython
  - python-dateutil=2.8.2=pyhd8ed1ab_0
  - python-tzdata=2022.1=pyhd8ed1ab_0
  - python_abi=3.9=2_cp39
  - pytorch=1.9.0=cpu_py39he8fdc14_2
  - pytorch-cpu=1.9.0=cpu_py39hd610c6a_2
  - pytz=2021.3=pyhd8ed1ab_0
  - pytz-deprecation-shim=0.1.0.post0=py39h2804cbe_2
  - pyyaml=5.4.1=py39h5161555_1
  - readline=8.1=hedafd6a_0
  - regex=2021.8.21=py39h5161555_0
  - requests=2.26.0=pyhd8ed1ab_0
  - sacremoses=0.0.43=pyh9f0ad1d_0
  - scikit-learn=0.24.2=py39hef7049f_1
  - scipy=1.7.0=py39h5060c3b_0
  - sentence-transformers=2.1.0=pyhd8ed1ab_0
  - sentencepiece=0.1.95=py39h4d2d688_1
  - setuptools=57.4.0=py39h2804cbe_0
  - six=1.16.0=pyh6c4a22f_0
  - sleef=3.5.1=h27ca646_1
  - sqlite=3.36.0=h72a2b83_0
  - starlette=0.14.2=pyhd8ed1ab_0
  - threadpoolctl=2.2.0=pyh8a188c0_0
  - tk=8.6.11=he1e0b03_0
  - tokenizers=0.10.3=py39hab32027_1
  - toml=0.10.2=pyhd8ed1ab_0
  - torchvision=0.10.1=py39h0a40b5a_0_cpu
  - tqdm=4.62.1=pyhd8ed1ab_0
  - transformers=4.14.1=pyhd8ed1ab_0
  - typing-extensions=3.10.0.0=hd8ed1ab_0
  - typing_extensions=3.10.0.0=pyha770c72_0
  - tzdata=2021a=he74cb21_1
  - tzlocal=4.2=py39h2804cbe_1
  - urllib3=1.26.6=pyhd8ed1ab_0
  - uvicorn=0.16.0=py39h2804cbe_0
  - wheel=0.37.0=pyhd8ed1ab_1
  - xz=5.2.5=h642e427_1
  - yaml=0.2.5=h642e427_0
  - zipp=3.5.0=pyhd8ed1ab_0
  - zlib=1.2.11=h31e879b_1009
  - zstd=1.5.0=h861e0a7_0
prefix: /opt/homebrew/Caskroom/miniforge/base/envs/khoj
@@ -7,7 +7,7 @@ services:
# If changing the remote port (right hand side),
# change the port in the args in the build section,
# as well as the port in the command section to match
- "8000:8000"
- "42110:42110"
working_dir: /app
volumes:
- .:/app

@@ -25,4 +25,4 @@ services:
- ./tests/data/embeddings/:/data/embeddings/
- ./tests/data/models/:/data/models/
# Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/
command: --host="0.0.0.0" --port=8000 -c=config/khoj_docker.yml -vv
command: --host="0.0.0.0" --port=42110 -c=config/khoj_docker.yml -vv
@@ -1,7 +1,7 @@
{
"id": "khoj",
"name": "Khoj",
"version": "0.8.2",
"version": "0.9.0",
"minAppVersion": "0.15.0",
"description": "An AI Personal Assistant for your Digital Brain",
"author": "Debanjum Singh Solanky",
@@ -46,7 +46,7 @@ dependencies = [
"tenacity >= 8.2.2",
"pillow == 9.3.0",
"pydantic >= 1.10.10",
"pyqt6 == 6.3.1",
"pyside6 >= 6.5.1",
"pyyaml == 6.0",
"rich >= 13.3.1",
"schedule == 1.1.0",
@@ -5,7 +5,7 @@
;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
;; Description: An AI personal assistant for your digital brain
;; Keywords: search, chat, org-mode, outlines, markdown, pdf, image
;; Version: 0.8.2
;; Version: 0.9.0
;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
;; URL: https://github.com/khoj-ai/khoj/tree/master/src/interface/emacs
@@ -62,7 +62,7 @@
;; Khoj Static Configuration
;; -------------------------

(defcustom khoj-server-url "http://localhost:8000"
(defcustom khoj-server-url "http://localhost:42110"
"Location of Khoj API server."
:group 'khoj
:type 'string)
@@ -221,6 +221,11 @@ for example), set this to the full interpreter path."
:type '(repeat string)
:group 'khoj)

(defcustom khoj-chat-model nil
"Specify chat model to use for chat with khoj."
:type 'string
:group 'khoj)

(defcustom khoj-openai-api-key nil
"OpenAI API key used to configure chat on khoj server."
:type 'string
@@ -368,7 +373,8 @@ CONFIG is json obtained from Khoj config API.
(ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false))))
(default-index-dir (khoj--get-directory-from-config default-config '(content-type org embeddings-file)))
(default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile)))
(default-model (or (alist-get 'model (alist-get 'conversation (alist-get 'processor default-config))) "text-davinci-003"))
(chat-model (or khoj-chat-model (alist-get 'chat-model (alist-get 'conversation (alist-get 'processor default-config)))))
(default-model (alist-get 'model (alist-get 'conversation (alist-get 'processor default-config))))
(config (or current-config default-config)))

;; Configure content types

@@ -423,6 +429,7 @@ CONFIG is json obtained from Khoj config API.
(message "khoj.el: Chat not configured yet.")
(setq config (delq (assoc 'processor config) config))
(cl-pushnew `(processor . ((conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
(chat-model . ,chat-model)
(model . ,default-model)
(openai-api-key . ,khoj-openai-api-key)))))
config))

@@ -432,6 +439,7 @@ CONFIG is json obtained from Khoj config API.
(let ((new-processor-type (alist-get 'processor config)))
(setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
(cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
(chat-model . ,chat-model)
(model . ,default-model)
(openai-api-key . ,khoj-openai-api-key)))
new-processor-type)

@@ -439,14 +447,15 @@ CONFIG is json obtained from Khoj config API.
(cl-pushnew `(processor . ,new-processor-type) config)))

;; Else if khoj is not configured with specified openai api key
((not (equal (alist-get 'openai-api-key (alist-get 'conversation (alist-get 'processor config))) khoj-openai-api-key))
((not (and (equal (alist-get 'openai-api-key (alist-get 'conversation (alist-get 'processor config))) khoj-openai-api-key)
(equal (alist-get 'chat-model (alist-get 'conversation (alist-get 'processor config))) khoj-chat-model)))
(message "khoj.el: Chat configuration has gone stale.")
(let* ((chat-directory (khoj--get-directory-from-config config '(processor conversation conversation-logfile)))
(model-name (khoj--get-directory-from-config config '(processor conversation model)))
(new-processor-type (alist-get 'processor config)))
(setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
(cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" chat-directory))
(model . ,model-name)
(model . ,default-model)
(chat-model . ,khoj-chat-model)
(openai-api-key . ,khoj-openai-api-key)))
new-processor-type)
(setq config (delq (assoc 'processor config) config))

@@ -595,7 +604,7 @@ CONFIG is json obtained from Khoj config API.
(file-extension (file-name-extension buffer-name)))
(cond
((and (member 'org enabled-content-types) (equal file-extension "org")) "org")
((and (member 'org enabled-content-types) (equal file-extension "pdf")) "pdf")
((and (member 'pdf enabled-content-types) (equal file-extension "pdf")) "pdf")
((and (member 'markdown enabled-content-types) (or (equal file-extension "markdown") (equal file-extension "md"))) "markdown")
(t khoj-default-content-type))))

@@ -609,13 +618,13 @@ CONFIG is json obtained from Khoj config API.
;; POST provided config to khoj server
(let ((url-request-method "POST")
(url-request-extra-headers '(("Content-Type" . "application/json")))
(url-request-data (json-encode-alist config))
(url-request-data (encode-coding-string (json-encode-alist config) 'utf-8))
(config-url (format "%s/api/config/data" khoj-server-url)))
(with-current-buffer (url-retrieve-synchronously config-url)
(buffer-string)))
;; Update index on khoj server after configuration update
(let ((khoj--server-ready? nil))
(url-retrieve (format "%s/api/update?t=org&client=emacs" khoj-server-url) #'identity)))
(url-retrieve (format "%s/api/update?client=emacs" khoj-server-url) #'identity)))

(defun khoj--get-enabled-content-types ()
"Get content types enabled for search from API."

@@ -1023,7 +1032,8 @@ Paragraph only starts at first text after blank line.
(let* ((force-update (if (member "--force-update" args) "true" "false"))
;; set content type to: specified > last used > based on current buffer > default type
(content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
(update-url (format "%s/api/update?t=%s&force=%s&client=emacs" khoj-server-url content-type force-update))
(type-query (if (equal content-type "all") "" (format "t=%s" content-type)))
(update-url (format "%s/api/update?%s&force=%s&client=emacs" khoj-server-url type-query force-update))
(url-request-method "GET"))
(progn
(setq khoj--content-type content-type)
@@ -1,7 +1,7 @@
{
"id": "khoj",
"name": "Khoj",
"version": "0.8.2",
"version": "0.9.0",
"minAppVersion": "0.15.0",
"description": "An AI Personal Assistant for your Digital Brain",
"author": "Debanjum Singh Solanky",
@@ -1,6 +1,6 @@
{
"name": "Khoj",
"version": "0.8.2",
"version": "0.9.0",
"description": "An AI Personal Assistant for your Digital Brain",
"main": "src/main.js",
"scripts": {
@@ -75,7 +75,7 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {
this.rerank = true
// Set input element to contents of active markdown file
// truncate to first 8,000 characters to avoid hitting query size limits
this.inputEl.value = await this.app.vault.read(file).then(file_str => file_str.slice(0, 8000));
this.inputEl.value = await this.app.vault.read(file).then(file_str => file_str.slice(0, 42110));
// Trigger search to get and render similar notes from khoj backend
this.inputEl.dispatchEvent(new Event('input'));
this.rerank = false
@@ -11,7 +11,7 @@ export interface KhojSetting {

export const DEFAULT_SETTINGS: KhojSetting = {
resultsCount: 6,
khojUrl: 'http://127.0.0.1:8000',
khojUrl: 'http://127.0.0.1:42110',
connectedToBackend: false,
autoConfigure: true,
openaiApiKey: '',
@@ -12,5 +12,6 @@
"0.7.1": "0.15.0",
"0.8.0": "0.15.0",
"0.8.1": "0.15.0",
"0.8.2": "0.15.0"
"0.8.2": "0.15.0",
"0.9.0": "0.15.0"
}
@@ -20,9 +20,15 @@ from khoj.processor.github.github_to_jsonl import GithubToJsonl
from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
from khoj.search_type import image_search, text_search
from khoj.utils import constants, state
from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
from khoj.utils.config import (
ContentIndex,
SearchType,
SearchModels,
ProcessorConfigModel,
ConversationProcessorConfigModel,
)
from khoj.utils.helpers import LRU, resolve_absolute_path, merge_dicts
from khoj.utils.rawconfig import FullConfig, ProcessorConfig
from khoj.utils.rawconfig import FullConfig, ProcessorConfig, SearchConfig, ContentConfig
from khoj.search_filter.date_filter import DateFilter
from khoj.search_filter.word_filter import WordFilter
from khoj.search_filter.file_filter import FileFilter

@@ -31,29 +37,61 @@ from khoj.search_filter.file_filter import FileFilter
logger = logging.getLogger(__name__)


def configure_server(args, required=False):
if args.config is None:
if required:
logger.error(
f"Exiting as Khoj is not configured.\nConfigure it via http://localhost:8000/config or by editing {state.config_file}."
)
sys.exit(1)
else:
logger.warning(
f"Khoj is not configured.\nConfigure it via http://localhost:8000/config, plugins or by editing {state.config_file}."
)
return
else:
state.config = args.config
def initialize_server(config: Optional[FullConfig], regenerate: bool, required=False):
if config is None and required:
logger.error(
f"🚨 Exiting as Khoj is not configured.\nConfigure it via http://localhost:42110/config or by editing {state.config_file}."
)
sys.exit(1)
elif config is None:
logger.warning(
f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/config, plugins or by editing {state.config_file}."
)
return None

try:
configure_server(config, regenerate)
except Exception as e:
logger.error(f"🚨 Failed to configure server on app load: {e}", exc_info=True)


def configure_server(config: FullConfig, regenerate: bool, search_type: Optional[SearchType] = None):
# Update Config
state.config = config

# Initialize Processor from Config
state.processor_config = configure_processor(args.config.processor)
try:
state.config_lock.acquire()
state.processor_config = configure_processor(state.config.processor)
except Exception as e:
logger.error(f"🚨 Failed to configure processor")
raise e
finally:
state.config_lock.release()

# Initialize the search type and model from Config
state.search_index_lock.acquire()
state.SearchType = configure_search_types(state.config)
state.model = configure_search(state.model, state.config, args.regenerate)
state.search_index_lock.release()
# Initialize Search Models from Config
try:
state.config_lock.acquire()
state.SearchType = configure_search_types(state.config)
state.search_models = configure_search(state.search_models, state.config.search_type)
except Exception as e:
logger.error(f"🚨 Failed to configure search models")
raise e
finally:
state.config_lock.release()

# Initialize Content from Config
if state.search_models:
try:
state.config_lock.acquire()
state.content_index = configure_content(
state.content_index, state.config.content_type, state.search_models, regenerate, search_type
)
except Exception as e:
logger.error(f"🚨 Failed to index content")
raise e
finally:
state.config_lock.release()


def configure_routes(app):

@@ -72,10 +110,16 @@ if not state.demo:

@schedule.repeat(schedule.every(61).minutes)
def update_search_index():
state.search_index_lock.acquire()
state.model = configure_search(state.model, state.config, regenerate=False)
state.search_index_lock.release()
logger.info("📬 Search index updated via Scheduler")
try:
state.config_lock.acquire()
state.content_index = configure_content(
state.content_index, state.config.content_type, state.search_models, regenerate=False
)
logger.info("📬 Content index updated via Scheduler")
except Exception as e:
logger.error(f"🚨 Error updating content index via Scheduler: {e}")
finally:
state.config_lock.release()


def configure_search_types(config: FullConfig):

@@ -90,111 +134,134 @@ def configure_search_types(config: FullConfig):
return Enum("SearchType", merge_dicts(core_search_types, plugin_search_types))


def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: Optional[state.SearchType] = None):
if config is None or config.content_type is None or config.search_type is None:
logger.warning("🚨 No Content or Search type is configured.")
return
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
# Run Validation Checks
if search_config is None:
logger.warning("🚨 No Search configuration available.")
return None
if search_models is None:
search_models = SearchModels()

if model is None:
model = SearchModels()
# Initialize Search Models
if search_config.asymmetric:
logger.info("🔍 📜 Setting up text search model")
search_models.text_search = text_search.initialize_model(search_config.asymmetric)

if search_config.image:
logger.info("🔍 🌄 Setting up image search model")
search_models.image_search = image_search.initialize_model(search_config.image)

return search_models


def configure_content(
content_index: Optional[ContentIndex],
content_config: Optional[ContentConfig],
search_models: SearchModels,
regenerate: bool,
t: Optional[state.SearchType] = None,
) -> Optional[ContentIndex]:
# Run Validation Checks
if content_config is None:
logger.warning("🚨 No Content configuration available.")
return None
if content_index is None:
content_index = ContentIndex()

try:
# Initialize Org Notes Search
if (t == state.SearchType.Org or t == None) and config.content_type.org and config.search_type.asymmetric:
if (t == state.SearchType.Org or t == None) and content_config.org and search_models.text_search:
logger.info("🦄 Setting up search for orgmode notes")
# Extract Entries, Generate Notes Embeddings
model.org_search = text_search.setup(
content_index.org = text_search.setup(
OrgToJsonl,
config.content_type.org,
search_config=config.search_type.asymmetric,
content_config.org,
search_models.text_search.bi_encoder,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()],
)

# Initialize Markdown Search
if (
(t == state.SearchType.Markdown or t == None)
and config.content_type.markdown
and config.search_type.asymmetric
):
if (t == state.SearchType.Markdown or t == None) and content_config.markdown and search_models.text_search:
logger.info("💎 Setting up search for markdown notes")
# Extract Entries, Generate Markdown Embeddings
model.markdown_search = text_search.setup(
content_index.markdown = text_search.setup(
MarkdownToJsonl,
config.content_type.markdown,
search_config=config.search_type.asymmetric,
content_config.markdown,
search_models.text_search.bi_encoder,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()],
)

# Initialize PDF Search
if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf and config.search_type.asymmetric:
if (t == state.SearchType.Pdf or t == None) and content_config.pdf and search_models.text_search:
logger.info("🖨️ Setting up search for pdf")
# Extract Entries, Generate PDF Embeddings
model.pdf_search = text_search.setup(
content_index.pdf = text_search.setup(
PdfToJsonl,
config.content_type.pdf,
search_config=config.search_type.asymmetric,
content_config.pdf,
search_models.text_search.bi_encoder,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()],
)

# Initialize Image Search
if (t == state.SearchType.Image or t == None) and config.content_type.image and config.search_type.image:
if (t == state.SearchType.Image or t == None) and content_config.image and search_models.image_search:
logger.info("🌄 Setting up search for images")
# Extract Entries, Generate Image Embeddings
model.image_search = image_search.setup(
config.content_type.image, search_config=config.search_type.image, regenerate=regenerate
content_index.image = image_search.setup(
content_config.image, search_models.image_search.image_encoder, regenerate=regenerate
)

if (t == state.SearchType.Github or t == None) and config.content_type.github and config.search_type.asymmetric:
if (t == state.SearchType.Github or t == None) and content_config.github and search_models.text_search:
logger.info("🐙 Setting up search for github")
# Extract Entries, Generate Github Embeddings
model.github_search = text_search.setup(
content_index.github = text_search.setup(
GithubToJsonl,
config.content_type.github,
search_config=config.search_type.asymmetric,
content_config.github,
search_models.text_search.bi_encoder,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()],
)

# Initialize External Plugin Search
if (t == None or t in state.SearchType) and config.content_type.plugins:
if (t == None or t in state.SearchType) and content_config.plugins and search_models.text_search:
logger.info("🔌 Setting up search for plugins")
model.plugin_search = {}
for plugin_type, plugin_config in config.content_type.plugins.items():
model.plugin_search[plugin_type] = text_search.setup(
content_index.plugins = {}
for plugin_type, plugin_config in content_config.plugins.items():
content_index.plugins[plugin_type] = text_search.setup(
JsonlToJsonl,
plugin_config,
search_config=config.search_type.asymmetric,
search_models.text_search.bi_encoder,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()],
)

# Initialize Notion Search
if (t == None or t in state.SearchType) and config.content_type.notion:
if (t == None or t in state.SearchType) and content_config.notion and search_models.text_search:
logger.info("🔌 Setting up search for notion")
model.notion_search = text_search.setup(
content_index.notion = text_search.setup(
NotionToJsonl,
config.content_type.notion,
search_config=config.search_type.asymmetric,
content_config.notion,
search_models.text_search.bi_encoder,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()],
)

except Exception as e:
logger.error("🚨 Failed to setup search")
logger.error(f"🚨 Failed to setup search: {e}", exc_info=True)
raise e

# Invalidate Query Cache
state.query_cache = LRU()

return model
return content_index


def configure_processor(processor_config: ProcessorConfig):
def configure_processor(processor_config: Optional[ProcessorConfig]):
if not processor_config:
return
logger.warning("🚨 No Processor configuration available.")
return None

processor = ProcessorConfigModel()
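The `configure_server` rewrite above wraps each configuration phase in `state.config_lock.acquire()` / `release()` inside try/finally, so a failure in one phase is logged and re-raised without leaving the lock held. Below is a minimal, self-contained sketch of that pattern; the `configure_search` stand-in and the plain dict config are illustrative only and much simpler than khoj's real config objects.

```python
# Sketch of the lock-guarded configuration pattern used in configure_server above.
# configure_search here is a stand-in; khoj's real function takes richer config objects.
import logging
import threading

logger = logging.getLogger(__name__)
config_lock = threading.Lock()

def configure_search(search_models, search_config):
    # Pretend to load a text search model when an asymmetric config is present
    return search_models or {"text_search": "bi-encoder"}

def configure_server(config: dict, regenerate: bool):
    search_models = None
    try:
        config_lock.acquire()
        search_models = configure_search(search_models, config.get("search_type"))
    except Exception:
        logger.error("🚨 Failed to configure search models")
        raise
    finally:
        # Always release, even on failure, so later configure/update requests are not deadlocked
        config_lock.release()
    return search_models

print(configure_server({"search_type": {"asymmetric": {}}}, regenerate=False))
```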
@@ -2,11 +2,24 @@
import webbrowser

# External Packages
from PyQt6 import QtGui, QtWidgets
from PyQt6.QtCore import Qt
from PySide6 import QtGui, QtWidgets
from PySide6.QtCore import Qt

# Internal Packages
from khoj.utils import constants
from PySide6.QtCore import QThread


class ServerThread(QThread):
    def __init__(self, start_server_func):
        super(ServerThread, self).__init__()
        self.start_server_func = start_server_func

    def __del__(self):
        self.wait()

    def run(self):
        self.start_server_func()


class MainWindow(QtWidgets.QMainWindow):
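The hunk above ports the desktop GUI from PyQt6 to PySide6 and adds a `ServerThread` QThread wrapper so the HTTP server can run off the GUI thread. A hedged usage sketch of that wrapper follows; it assumes PySide6 is installed, and a `time.sleep` stands in for the real `uvicorn.run(...)` call that khoj wires up in main.py.

```python
# Hypothetical usage of the ServerThread pattern shown above (not khoj's actual wiring).
import sys
import time
from PySide6.QtCore import QThread
from PySide6.QtWidgets import QApplication, QLabel

class ServerThread(QThread):
    def __init__(self, start_server_func):
        super().__init__()
        self.start_server_func = start_server_func

    def run(self):
        # Runs in the worker thread, keeping the Qt event loop responsive
        self.start_server_func()

def start_server():
    time.sleep(1)  # stand-in for uvicorn.run(app, host=..., port=42110)

app = QApplication(sys.argv)
window = QLabel("Khoj server starting…")
window.show()
server = ServerThread(start_server)
server.start()
sys.exit(app.exec())
```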
@@ -2,7 +2,7 @@
import webbrowser

# External Packages
from PyQt6 import QtGui, QtWidgets
from PySide6 import QtGui, QtWidgets

# Internal Packages
from khoj.utils import constants, state
@@ -0,0 +1 @@ (new file, 1.8 KiB)
<svg xmlns="http://www.w3.org/2000/svg" shape-rendering="geometricPrecision" text-rendering="geometricPrecision" image-rendering="optimizeQuality" fill-rule="evenodd" clip-rule="evenodd" viewBox="0 0 512 512"><path fill-rule="nonzero" d="M256 0c70.69 0 134.7 28.66 181.02 74.98C483.34 121.31 512 185.31 512 256c0 70.69-28.66 134.7-74.98 181.02C390.7 483.34 326.69 512 256 512c-70.69 0-134.69-28.66-181.02-74.98C28.66 390.7 0 326.69 0 256c0-70.69 28.66-134.69 74.98-181.02C121.31 28.66 185.31 0 256 0zm-21.49 301.51v-2.03c.16-13.46 1.48-24.12 4.07-32.05 2.54-7.92 6.19-14.37 10.97-19.25 4.77-4.92 10.51-9.39 17.22-13.46 4.31-2.74 8.22-5.78 11.68-9.18 3.45-3.36 6.19-7.27 8.23-11.69 2.02-4.37 3.04-9.24 3.04-14.62 0-6.4-1.52-11.94-4.57-16.66-3-4.68-7.06-8.28-12.04-10.87-5.03-2.54-10.61-3.81-16.76-3.81-5.53 0-10.81 1.11-15.89 3.45-5.03 2.29-9.25 5.89-12.55 10.77-3.3 4.87-5.23 11.12-5.74 18.74h-32.91c.51-12.95 3.81-23.92 9.85-32.91 6.1-8.99 14.13-15.8 24.08-20.42 10.01-4.62 21.08-6.9 33.16-6.9 13.31 0 24.89 2.43 34.84 7.41 9.96 4.93 17.73 11.83 23.27 20.67 5.48 8.84 8.28 19.1 8.28 30.88 0 8.08-1.27 15.34-3.81 21.79-2.54 6.45-6.1 12.24-10.77 17.27-4.68 5.08-10.21 9.54-16.71 13.41-6.15 3.86-11.12 7.82-14.88 11.93-3.81 4.11-6.56 8.99-8.28 14.58-1.73 5.63-2.69 12.59-2.84 20.92v2.03h-30.94zm16.36 65.82c-5.94-.04-11.02-2.13-15.29-6.35-4.26-4.21-6.35-9.34-6.35-15.33 0-5.89 2.09-10.97 6.35-15.19 4.27-4.21 9.35-6.35 15.29-6.35 5.84 0 10.92 2.14 15.18 6.35 4.32 4.22 6.45 9.3 6.45 15.19 0 3.96-1.01 7.62-2.99 10.87-1.98 3.3-4.57 5.94-7.82 7.87-3.25 1.93-6.86 2.9-10.82 2.94zM417.71 94.29C376.33 52.92 319.15 27.32 256 27.32c-63.15 0-120.32 25.6-161.71 66.97C52.92 135.68 27.32 192.85 27.32 256c0 63.15 25.6 120.33 66.97 161.71 41.39 41.37 98.56 66.97 161.71 66.97 63.15 0 120.33-25.6 161.71-66.97 41.37-41.38 66.97-98.56 66.97-161.71 0-63.15-25.6-120.32-66.97-161.71z"/></svg>
@@ -85,6 +85,21 @@ img.khoj-logo {
justify-self: center;
}

a.khoj-banner {
color: black;
text-decoration: none;
}

p.khoj-banner {
font-size: medium;
margin: 0;
padding: 10px;
}

p#khoj-banner {
display: inline;
}

@media only screen and (max-width: 600px) {
div.khoj-header {
display: grid;
@@ -51,6 +51,10 @@
body.khoj-configure {
padding: 0;
}

div.section {
padding: 12px;
}
}

img.khoj-logo {

@@ -69,6 +73,11 @@
display: grid;
justify-self: center;
}

div.instructions {
font-size: large;
}

.section-title {
margin: 0;
padding: 0 0 16px 0;

@@ -162,6 +171,11 @@
max-width: 16px;
}

div.finalize-actions {
grid-auto-flow: column;
grid-gap: 24px;
}

@media screen and (max-width: 600px) {
.section-cards {
grid-template-columns: 1fr;
@@ -166,20 +166,20 @@
}
</script>
<body>
<div id="khoj-banner-container" class="khoj-banner-container">
{% if demo %}
<!-- Banner linking to https://khoj.dev -->
<a class="khoj-banner" href="https://khoj.dev" target="_blank">
<p id="khoj-banner" class="khoj-banner">
Enroll in Khoj cloud to get your own assistant
</p>
</a>
<input type="text" id="khoj-banner-email" placeholder="email" class="khoj-banner-email"></input>
<button id="khoj-banner-submit" class="khoj-banner-button">Submit</button>
{% endif %}
</div>
<!--Add Header Logo and Nav Pane-->
<div class="khoj-header">
{% if demo %}
<!-- Banner linking to https://khoj.dev -->
<div class="khoj-banner-container">
<a class="khoj-banner" href="https://khoj.dev" target="_blank">
<p id="khoj-banner" class="khoj-banner">
Enroll in Khoj cloud to get your own Github assistant
</p>
</a>
<input type="text" id="khoj-banner-email" placeholder="email" class="khoj-banner-email"></input>
<button id="khoj-banner-submit" class="khoj-banner-button">Submit</button>
</div>
{% endif %}
{% if demo %}
<a class="khoj-logo" href="https://khoj.dev" target="_blank">
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways.svg" alt="Khoj"></img>

@@ -351,7 +351,7 @@
@media only screen and (max-width: 600px) {
body {
grid-template-columns: 1fr;
grid-template-rows: auto minmax(80px, 100%) auto;
grid-template-rows: auto auto minmax(80px, 100%) auto;
}
body > * {
grid-column: 1;

@@ -364,11 +364,14 @@
a.khoj-banner {
display: block;
}
p.khoj-banner {
padding: 0;
}
}
@media only screen and (min-width: 600px) {
body {
grid-template-columns: auto min(70vw, 100%) auto;
grid-template-rows: auto minmax(80px, 100%) auto;
grid-template-rows: auto auto minmax(80px, 100%) auto;
}
body > * {
grid-column: 2;

@@ -395,19 +398,10 @@
}
}

a.khoj-banner {
color: black;
}

a.khoj-logo {
text-align: center;
}

p.khoj-banner {
margin: 0;
padding: 10px;
}

button#khoj-banner-submit,
input#khoj-banner-email {
padding: 10px;

@@ -420,17 +414,17 @@
input#khoj-banner-email:hover {
box-shadow: 0 0 11px #aaa;
}

p#khoj-banner {
display: inline;
}

a.khoj-banner {
color: black;
text-decoration: none;
div.khoj-banner-container-hidden {
margin: 0px;
padding: 0px;
}
</style>
<script>
if ("{{demo}}" === "False") {
document.getElementById("khoj-banner-container").classList.remove("khoj-banner-container");
document.getElementById("khoj-banner-container").classList.add("khoj-banner-container-hidden");
}

var khojBannerSubmit = document.getElementById("khoj-banner-submit");

khojBannerSubmit?.addEventListener("click", function(event) {
@@ -11,7 +11,11 @@
<h3 class="card-title">
Github
{% if current_config.content_type.github %}
<img id="configured-icon-github" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% if current_model_state.github == False %}
<img id="misconfigured-icon-github" class="configured-icon" src="/static/assets/icons/question-mark-icon.svg" alt="Not Configured" title="Embeddings have not been generated yet for this content type. Either the configuration is invalid, or you just need to click Configure.">
{% else %}
<img id="configured-icon-github" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% endif %}
{% endif %}
</h3>
</div>

@@ -42,7 +46,11 @@
<h3 class="card-title">
Notion
{% if current_config.content_type.notion %}
<img id="configured-icon-notion" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% if current_model_state.notion == False %}
<img id="misconfigured-icon-notion" class="configured-icon" src="/static/assets/icons/question-mark-icon.svg" alt="Not Configured" title="Embeddings have not been generated yet for this content type. Either the configuration is invalid, or you just need to click Configure.">
{% else %}
<img id="configured-icon-notion" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% endif %}
{% endif %}
</h3>
</div>

@@ -73,7 +81,11 @@
<h3 class="card-title">
Markdown
{% if current_config.content_type.markdown %}
<img id="configured-icon-markdown" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% if current_model_state.markdown == False%}
<img id="misconfigured-icon-markdown" class="configured-icon" src="/static/assets/icons/question-mark-icon.svg" alt="Not Configured" title="Embeddings have not been generated yet for this content type. Either the configuration is invalid, or you just need to click Configure.">
{% else %}
<img id="configured-icon-markdown" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% endif %}
{% endif %}
</h3>
</div>

@@ -104,7 +116,11 @@
<h3 class="card-title">
Org
{% if current_config.content_type.org %}
<img id="configured-icon-org" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% if current_model_state.org == False %}
<img id="misconfigured-icon-org" class="configured-icon" src="/static/assets/icons/question-mark-icon.svg" alt="Not Configured" title="Embeddings have not been generated yet for this content type. Either the configuration is invalid, or you just need to click Configure.">
{% else %}
<img id="configured-icon-org" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% endif %}
{% endif %}
</h3>
</div>

@@ -135,7 +151,11 @@
<h3 class="card-title">
PDF
{% if current_config.content_type.pdf %}
<img id="configured-icon-pdf" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% if current_model_state.pdf == False %}
<img id="misconfigured-icon-pdf" class="configured-icon" src="/static/assets/icons/question-mark-icon.svg" alt="Not Configured" title="Embeddings have not been generated yet for this content type. Either the configuration is invalid, or you need to click Configure.">
{% else %}
<img id="configured-icon-pdf" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% endif %}
{% endif %}
</h3>
</div>

@@ -171,8 +191,12 @@
<h3 class="card-title">
Chat
{% if current_config.processor and current_config.processor.conversation %}
<img id="configured-icon-conversation-processor" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% endif %}
{% if current_model_state.conversation == False %}
<img id="misconfigured-icon-conversation-processor" class="configured-icon" src="/static/assets/icons/question-mark-icon.svg" alt="Not Configured" title="Embeddings have not been generated yet for this content type. Either the configuration is invalid, or you just need to click Configure.">
{% else %}
<img id="configured-icon-conversation-processor" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% endif %}
{% endif %}
</h3>
</div>
<div class="card-description-row">

@@ -204,6 +228,8 @@
<input type="range" id="results-count-slider" name="results-count-slider" min="1" max="10" step="1" value="5">
</div>
<div id="status" style="display: none;"></div>
</div>
<div class="section finalize-actions">
<button id="configure" type="submit" title="Update index with the latest changes">⚙️ Configure</button>
<button id="reinitialize" type="submit" title="Regenerate index from scratch">🔄 Reinitialize</button>
</div>

@@ -225,7 +251,14 @@
contentTypeClearButton.style.display = "none";

var configuredIcon = document.getElementById("configured-icon-" + content_type);
configuredIcon.style.display = "none";
if (configuredIcon) {
configuredIcon.style.display = "none";
}

var misconfiguredIcon = document.getElementById("misconfigured-icon-" + content_type);
if (misconfiguredIcon) {
misconfiguredIcon.style.display = "none";
}
}
})
};

@@ -246,7 +279,15 @@
conversationClearButton.style.display = "none";

var configuredIcon = document.getElementById("configured-icon-conversation-processor");
configuredIcon.style.display = "none";
if (configuredIcon) {
configuredIcon.style.display = "none";
}

var misconfiguredIcon = document.getElementById("misconfigured-icon-conversation-processor");

if (misconfiguredIcon) {
misconfiguredIcon.style.display = "none";
}
}
})
};

@@ -292,14 +333,14 @@
if (data.detail != null) {
throw new Error(data.detail);
}
document.getElementById("status").innerHTML = emoji + successText;
document.getElementById("status").innerHTML = emoji + " " + successText;
document.getElementById("status").style.display = "block";
button.disabled = false;
button.innerHTML = '✅ Done!';
})
.catch((error) => {
console.error('Error:', error);
document.getElementById("status").innerHTML = emoji + errorText
document.getElementById("status").innerHTML = emoji + " " + errorText
document.getElementById("status").style.display = "block";
button.disabled = false;
button.innerHTML = '⚠️ Unsuccessful';
@@ -5,6 +5,9 @@
<h2 class="section-title">
<img class="card-icon" src="/static/assets/icons/github.svg" alt="Github">
<span class="card-title-text">Github</span>
<div class="instructions">
<a href="https://github.com/khoj-ai/khoj/wiki/Setup-Github-integration">ⓘ Help</a>
</div>
</h2>
<form>
<table>
@@ -5,6 +5,9 @@
|
||||
<h2 class="section-title">
|
||||
<img class="card-icon" src="/static/assets/icons/notion.svg" alt="Notion">
|
||||
<span class="card-title-text">Notion</span>
|
||||
<div class="instructions">
|
||||
<a href="https://github.com/khoj-ai/khoj/wiki/Setup-Notion-Integration">ⓘ Help</a>
|
||||
</div>
|
||||
</h2>
|
||||
<form>
|
||||
<table>
|
||||
|
||||
@@ -216,19 +216,21 @@
|
||||
</script>
|
||||
|
||||
<body>
|
||||
{% if demo %}
|
||||
<!-- Banner linking to https://khoj.dev -->
|
||||
<div class="khoj-banner-container">
|
||||
<a class="khoj-banner" href="https://khoj.dev" target="_blank">
|
||||
<p id="khoj-banner" class="khoj-banner">
|
||||
Enroll in Khoj cloud to get your own assistant
|
||||
</p>
|
||||
</a>
|
||||
<input type="text" id="khoj-banner-email" placeholder="email" class="khoj-banner-email"></input>
|
||||
<button id="khoj-banner-submit" class="khoj-banner-button">Submit</button>
|
||||
</div>
|
||||
{% endif %}
|
||||
<!--Add Header Logo and Nav Pane-->
|
||||
<div class="khoj-header">
|
||||
{% if demo %}
|
||||
<!-- Banner linking to https://khoj.dev -->
|
||||
<div class="khoj-banner-container">
|
||||
<a class="khoj-banner" href="https://khoj.dev" target="_blank">
|
||||
<p id="khoj-banner" class="khoj-banner">
|
||||
Enroll in Khoj cloud to get your own Github assistant
|
||||
</p>
|
||||
</a>
|
||||
<input type="text" id="khoj-banner-email" placeholder="email" class="khoj-banner-email"></input>
|
||||
<button id="khoj-banner-submit" class="khoj-banner-button">Submit</button>
|
||||
</div>
|
||||
<a class="khoj-logo" href="https://khoj.dev" target="_blank">
|
||||
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways.svg" alt="Khoj"></img>
|
||||
</a>
|
||||
@@ -447,19 +449,10 @@
|
||||
}
|
||||
}
|
||||
|
||||
a.khoj-banner {
|
||||
color: black;
|
||||
}
|
||||
|
||||
a.khoj-logo {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
p.khoj-banner {
|
||||
margin: 0;
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
button#khoj-banner-submit,
|
||||
input#khoj-banner-email {
|
||||
padding: 10px;
|
||||
@@ -473,14 +466,13 @@
|
||||
box-shadow: 0 0 11px #aaa;
|
||||
}
|
||||
|
||||
p#khoj-banner {
|
||||
display: inline;
|
||||
}
|
||||
|
||||
@media only screen and (max-width: 600px) {
|
||||
a.khoj-banner {
|
||||
display: block;
|
||||
}
|
||||
p.khoj-banner {
|
||||
padding: 0;
|
||||
}
|
||||
}
|
||||
|
||||
</style>
|
||||
|
||||
@@ -2,6 +2,12 @@
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
|
||||
if sys.stdout is None:
|
||||
sys.stdout = open(os.devnull, "w")
|
||||
if sys.stderr is None:
|
||||
sys.stderr = open(os.devnull, "w")
|
||||
|
||||
import logging
|
||||
import threading
|
||||
import warnings
|
||||
@@ -15,18 +21,13 @@ warnings.filterwarnings("ignore", message=r"legacy way to download files from th
|
||||
# External Packages
|
||||
import uvicorn
|
||||
from fastapi import FastAPI
|
||||
from PyQt6 import QtWidgets
|
||||
from PyQt6.QtCore import QThread, QTimer
|
||||
from rich.logging import RichHandler
|
||||
import schedule
|
||||
|
||||
# Internal Packages
|
||||
from khoj.configure import configure_routes, configure_server
|
||||
from khoj.configure import configure_routes, initialize_server
|
||||
from khoj.utils import state
|
||||
from khoj.utils.cli import cli
|
||||
from khoj.interface.desktop.main_window import MainWindow
|
||||
from khoj.interface.desktop.system_tray import create_system_tray
|
||||
|
||||
|
||||
# Initialize the Application Server
|
||||
app = FastAPI()
|
||||
@@ -69,10 +70,15 @@ def run():
|
||||
poll_task_scheduler()
|
||||
|
||||
# Start Server
|
||||
configure_server(args, required=False)
|
||||
initialize_server(args.config, args.regenerate, required=False)
|
||||
configure_routes(app)
|
||||
start_server(app, host=args.host, port=args.port, socket=args.socket)
|
||||
else:
|
||||
from PySide6 import QtWidgets
|
||||
from PySide6.QtCore import QThread, QTimer
|
||||
from khoj.interface.desktop.main_window import MainWindow, ServerThread
|
||||
from khoj.interface.desktop.system_tray import create_system_tray
|
||||
|
||||
# Setup GUI
|
||||
gui = QtWidgets.QApplication([])
|
||||
main_window = MainWindow(args.host, args.port)
|
||||
@@ -87,9 +93,9 @@ def run():
|
||||
tray.show()
|
||||
|
||||
# Setup Server
|
||||
configure_server(args, required=False)
|
||||
initialize_server(args.config, args.regenerate, required=False)
|
||||
configure_routes(app)
|
||||
server = ServerThread(app, args.host, args.port, args.socket)
|
||||
server = ServerThread(start_server_func=lambda: start_server(app, host=args.host, port=args.port))
|
||||
|
||||
url = f"http://{args.host}:{args.port}"
|
||||
logger.info(f"🌗 Khoj is running at {url}")
|
||||
@@ -130,6 +136,8 @@ def run():
|
||||
|
||||
|
||||
def sigint_handler(*args):
|
||||
from PySide6 import QtWidgets
|
||||
|
||||
QtWidgets.QApplication.quit()
|
||||
|
||||
|
||||
@@ -158,21 +166,6 @@ def poll_task_scheduler():
|
||||
schedule.run_pending()
|
||||
|
||||
|
||||
class ServerThread(QThread):
|
||||
def __init__(self, app, host=None, port=None, socket=None):
|
||||
super(ServerThread, self).__init__()
|
||||
self.app = app
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.socket = socket
|
||||
|
||||
def __del__(self):
|
||||
self.wait()
|
||||
|
||||
def run(self):
|
||||
start_server(self.app, self.host, self.port, self.socket)
|
||||
|
||||
|
||||
def run_gui():
|
||||
sys.argv += ["--gui"]
|
||||
run()
|
||||
|
||||
@@ -13,9 +13,8 @@ from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.jsonl import compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from khoj.utils import state
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -38,7 +37,7 @@ class GithubToJsonl(TextToJsonl):
|
||||
else:
|
||||
return
|
||||
|
||||
def process(self, previous_entries=None):
|
||||
def process(self, previous_entries=[]):
|
||||
current_entries = []
|
||||
for repo in self.config.repos:
|
||||
current_entries += self.process_repo(repo)
|
||||
@@ -98,10 +97,7 @@ class GithubToJsonl(TextToJsonl):
|
||||
jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if self.config.compressed_jsonl.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
|
||||
elif self.config.compressed_jsonl.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, self.config.compressed_jsonl)
|
||||
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ from typing import List
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.helpers import get_absolute_path, timer
|
||||
from khoj.utils.jsonl import load_jsonl, dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.jsonl import load_jsonl, compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class JsonlToJsonl(TextToJsonl):
|
||||
# Define Functions
|
||||
def process(self, previous_entries=None):
|
||||
def process(self, previous_entries=[]):
|
||||
# Extract required fields from config
|
||||
input_jsonl_files, input_jsonl_filter, output_file = (
|
||||
self.config.input_files,
|
||||
@@ -38,15 +38,9 @@ class JsonlToJsonl(TextToJsonl):
|
||||
|
||||
# Identify, mark and merge any new entries with previous entries
|
||||
with timer("Identify new or updated entries", logger):
|
||||
if not previous_entries:
|
||||
entries_with_ids = list(enumerate(current_entries))
|
||||
else:
|
||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||
current_entries,
|
||||
previous_entries,
|
||||
key="compiled",
|
||||
logger=logger,
|
||||
)
|
||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||
current_entries, previous_entries, key="compiled", logger=logger
|
||||
)
|
||||
|
||||
with timer("Write entries to JSONL file", logger):
|
||||
# Process Each Entry from All Notes Files
|
||||
@@ -54,10 +48,7 @@ class JsonlToJsonl(TextToJsonl):
|
||||
jsonl_data = JsonlToJsonl.convert_entries_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if output_file.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
elif output_file.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, output_file)
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ from typing import List
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
||||
from khoj.utils.constants import empty_escape_sequences
|
||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.jsonl import compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry, TextContentConfig
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ class MarkdownToJsonl(TextToJsonl):
|
||||
self.config = config
|
||||
|
||||
# Define Functions
|
||||
def process(self, previous_entries=None):
|
||||
def process(self, previous_entries=[]):
|
||||
# Extract required fields from config
|
||||
markdown_files, markdown_file_filter, output_file = (
|
||||
self.config.input_files,
|
||||
@@ -51,12 +51,9 @@ class MarkdownToJsonl(TextToJsonl):
|
||||
|
||||
# Identify, mark and merge any new entries with previous entries
|
||||
with timer("Identify new or updated entries", logger):
|
||||
if not previous_entries:
|
||||
entries_with_ids = list(enumerate(current_entries))
|
||||
else:
|
||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||
current_entries, previous_entries, key="compiled", logger=logger
|
||||
)
|
||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||
current_entries, previous_entries, key="compiled", logger=logger
|
||||
)
|
||||
|
||||
with timer("Write markdown entries to JSONL file", logger):
|
||||
# Process Each Entry from All Notes Files
|
||||
@@ -64,10 +61,7 @@ class MarkdownToJsonl(TextToJsonl):
|
||||
jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if output_file.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
elif output_file.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, output_file)
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ import requests
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry, NotionContentConfig
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.jsonl import compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry
|
||||
|
||||
from enum import Enum
|
||||
@@ -78,7 +78,9 @@ class NotionToJsonl(TextToJsonl):
|
||||
NotionBlockType.DIVIDER.value,
|
||||
]
|
||||
|
||||
def process(self, previous_entries=None):
|
||||
self.body_params = {"page_size": 100}
|
||||
|
||||
def process(self, previous_entries=[]):
|
||||
current_entries = []
|
||||
|
||||
# Get all pages
|
||||
@@ -88,13 +90,13 @@ class NotionToJsonl(TextToJsonl):
|
||||
while True:
|
||||
result = self.session.post(
|
||||
"https://api.notion.com/v1/search",
|
||||
json={"page_size": 100},
|
||||
json=self.body_params,
|
||||
).json()
|
||||
responses.append(result)
|
||||
if result["has_more"] == False:
|
||||
break
|
||||
else:
|
||||
self.session.params = {"start_cursor": responses[-1]["next_cursor"]}
|
||||
self.body_params.update({"start_cursor": result["next_cursor"]})
|
||||
|
||||
for response in responses:
|
||||
with timer("Processing response", logger=logger):
|
||||
@@ -174,7 +176,8 @@ class NotionToJsonl(TextToJsonl):
|
||||
return f"\n<b>{heading}</b>\n"
|
||||
|
||||
def process_nested_children(self, children, raw_content, block_type=None):
|
||||
for child in children["results"]:
|
||||
results = children["results"] if children.get("results") else []
|
||||
for child in results:
|
||||
child_type = child.get("type")
|
||||
if child_type == None:
|
||||
continue
|
||||
@@ -199,7 +202,11 @@ class NotionToJsonl(TextToJsonl):
|
||||
return raw_text
|
||||
|
||||
def get_block_children(self, block_id):
|
||||
return self.session.get(f"https://api.notion.com/v1/blocks/{block_id}/children").json()
|
||||
try:
|
||||
return self.session.get(f"https://api.notion.com/v1/blocks/{block_id}/children").json()
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting children for block {block_id}: {e}")
|
||||
return {}
|
||||
|
||||
def get_page(self, page_id):
|
||||
return self.session.get(f"https://api.notion.com/v1/pages/{page_id}").json()
|
||||
@@ -215,19 +222,27 @@ class NotionToJsonl(TextToJsonl):
|
||||
logger.error(f"Error getting page {page_id}: {e}")
|
||||
return None, None
|
||||
properties = page["properties"]
|
||||
title_field = "Title" if "Title" in properties else "title"
|
||||
title_field = "title"
|
||||
if "Title" in properties:
|
||||
title_field = "Title"
|
||||
elif "Name" in properties:
|
||||
title_field = "Name"
|
||||
elif "Page" in properties:
|
||||
title_field = "Page"
|
||||
elif "Event" in properties:
|
||||
title_field = "Event"
|
||||
elif title_field not in properties:
|
||||
logger.error(f"Page {page_id} does not have a title field")
|
||||
return None, None
|
||||
title = page["properties"][title_field]["title"][0]["text"]["content"]
|
||||
return title, content
|
||||
|
||||
def update_entries_with_ids(self, current_entries, previous_entries):
|
||||
# Identify, mark and merge any new entries with previous entries
|
||||
with timer("Identify new or updated entries", logger):
|
||||
if not previous_entries:
|
||||
entries_with_ids = list(enumerate(current_entries))
|
||||
else:
|
||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||
current_entries, previous_entries, key="compiled", logger=logger
|
||||
)
|
||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||
current_entries, previous_entries, key="compiled", logger=logger
|
||||
)
|
||||
|
||||
with timer("Write Notion entries to JSONL file", logger):
|
||||
# Process Each Entry from all Notion entries
|
||||
@@ -235,9 +250,6 @@ class NotionToJsonl(TextToJsonl):
|
||||
jsonl_data = TextToJsonl.convert_text_maps_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if self.config.compressed_jsonl.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
|
||||
elif self.config.compressed_jsonl.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, self.config.compressed_jsonl)
|
||||
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
@@ -8,7 +8,7 @@ from typing import Iterable, List
|
||||
from khoj.processor.org_mode import orgnode
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.jsonl import compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry, TextContentConfig
|
||||
from khoj.utils import state
|
||||
|
||||
@@ -22,7 +22,7 @@ class OrgToJsonl(TextToJsonl):
|
||||
self.config = config
|
||||
|
||||
# Define Functions
|
||||
def process(self, previous_entries: List[Entry] = None):
|
||||
def process(self, previous_entries: List[Entry] = []):
|
||||
# Extract required fields from config
|
||||
org_files, org_file_filter, output_file = (
|
||||
self.config.input_files,
|
||||
@@ -51,9 +51,7 @@ class OrgToJsonl(TextToJsonl):
|
||||
current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||
|
||||
# Identify, mark and merge any new entries with previous entries
|
||||
if not previous_entries:
|
||||
entries_with_ids = list(enumerate(current_entries))
|
||||
else:
|
||||
with timer("Identify new or updated entries", logger):
|
||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||
current_entries, previous_entries, key="compiled", logger=logger
|
||||
)
|
||||
@@ -64,10 +62,7 @@ class OrgToJsonl(TextToJsonl):
|
||||
jsonl_data = self.convert_org_entries_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if output_file.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
elif output_file.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, output_file)
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
@@ -125,9 +120,13 @@ class OrgToJsonl(TextToJsonl):
|
||||
# Ignore title notes i.e notes with just headings and empty body
|
||||
continue
|
||||
|
||||
todo_str = f"{parsed_entry.todo} " if parsed_entry.todo else ""
|
||||
# Prepend filename as top heading to entry
|
||||
filename = Path(entry_to_file_map[parsed_entry]).stem
|
||||
heading = f"* {filename}\n** {parsed_entry.heading}." if parsed_entry.heading else f"* {filename}."
|
||||
if parsed_entry.heading:
|
||||
heading = f"* {filename}\n** {todo_str}{parsed_entry.heading}."
|
||||
else:
|
||||
heading = f"* {filename}."
|
||||
|
||||
compiled = heading
|
||||
if state.verbose > 2:
|
||||
|
||||
@@ -10,7 +10,7 @@ from langchain.document_loaders import PyPDFLoader
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.jsonl import compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class PdfToJsonl(TextToJsonl):
|
||||
# Define Functions
|
||||
def process(self, previous_entries=None):
|
||||
def process(self, previous_entries=[]):
|
||||
# Extract required fields from config
|
||||
pdf_files, pdf_file_filter, output_file = (
|
||||
self.config.input_files,
|
||||
@@ -45,12 +45,9 @@ class PdfToJsonl(TextToJsonl):
|
||||
|
||||
# Identify, mark and merge any new entries with previous entries
|
||||
with timer("Identify new or updated entries", logger):
|
||||
if not previous_entries:
|
||||
entries_with_ids = list(enumerate(current_entries))
|
||||
else:
|
||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||
current_entries, previous_entries, key="compiled", logger=logger
|
||||
)
|
||||
entries_with_ids = TextToJsonl.mark_entries_for_update(
|
||||
current_entries, previous_entries, key="compiled", logger=logger
|
||||
)
|
||||
|
||||
with timer("Write PDF entries to JSONL file", logger):
|
||||
# Process Each Entry from All Notes Files
|
||||
@@ -58,10 +55,7 @@ class PdfToJsonl(TextToJsonl):
|
||||
jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries)
|
||||
|
||||
# Compress JSONL formatted Data
|
||||
if output_file.suffix == ".gz":
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
elif output_file.suffix == ".jsonl":
|
||||
dump_jsonl(jsonl_data, output_file)
|
||||
compress_jsonl_data(jsonl_data, output_file)
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ class TextToJsonl(ABC):
|
||||
self.config = config
|
||||
|
||||
@abstractmethod
|
||||
def process(self, previous_entries: List[Entry] = None) -> List[Tuple[int, Entry]]:
|
||||
def process(self, previous_entries: List[Entry] = []) -> List[Tuple[int, Entry]]:
|
||||
...
|
||||
|
||||
@staticmethod
|
||||
@@ -78,16 +78,23 @@ class TextToJsonl(ABC):
|
||||
# All entries that exist in both current and previous sets are kept
|
||||
existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes)
|
||||
|
||||
# load new entries in the order in which they are processed for a stable sort
|
||||
new_entries = [
|
||||
(current_entry_hashes.index(entry_hash), hash_to_current_entries[entry_hash])
|
||||
for entry_hash in new_entry_hashes
|
||||
]
|
||||
new_entries_sorted = sorted(new_entries, key=lambda e: e[0])
|
||||
# Mark new entries with -1 id to flag for later embeddings generation
|
||||
new_entries = [(-1, hash_to_current_entries[entry_hash]) for entry_hash in new_entry_hashes]
|
||||
new_entries_sorted = [(-1, entry[1]) for entry in new_entries_sorted]
|
||||
|
||||
# Set id of existing entries to their previous ids to reuse their existing encoded embeddings
|
||||
existing_entries = [
|
||||
(previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
|
||||
for entry_hash in existing_entry_hashes
|
||||
]
|
||||
|
||||
existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
|
||||
entries_with_ids = existing_entries_sorted + new_entries
|
||||
|
||||
entries_with_ids = existing_entries_sorted + new_entries_sorted
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
|
||||
@@ -5,20 +5,20 @@ import time
|
||||
import yaml
|
||||
import logging
|
||||
import json
|
||||
from typing import List, Optional, Union
|
||||
from typing import Iterable, List, Optional, Union
|
||||
|
||||
# External Packages
|
||||
from fastapi import APIRouter, HTTPException, Header, Request
|
||||
from sentence_transformers import util
|
||||
|
||||
# Internal Packages
|
||||
from khoj.configure import configure_processor, configure_search
|
||||
from khoj.configure import configure_processor, configure_server
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.search_filter.date_filter import DateFilter
|
||||
from khoj.search_filter.file_filter import FileFilter
|
||||
from khoj.search_filter.word_filter import WordFilter
|
||||
from khoj.utils.config import TextSearchModel
|
||||
from khoj.utils.helpers import log_telemetry, timer
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import (
|
||||
ContentConfig,
|
||||
FullConfig,
|
||||
@@ -34,7 +34,7 @@ from khoj.utils.state import SearchType
|
||||
from khoj.utils import state, constants
|
||||
from khoj.utils.yaml import save_config_to_file_updated_state
|
||||
from fastapi.responses import StreamingResponse, Response
|
||||
from khoj.routers.helpers import perform_chat_checks, generate_chat_response
|
||||
from khoj.routers.helpers import perform_chat_checks, generate_chat_response, update_telemetry_state
|
||||
from khoj.processor.conversation.gpt import extract_questions
|
||||
from fastapi.requests import Request
|
||||
|
||||
@@ -56,15 +56,44 @@ if not state.demo:
|
||||
return state.config
|
||||
|
||||
@api.post("/config/data")
|
||||
async def set_config_data(updated_config: FullConfig):
|
||||
async def set_config_data(
|
||||
request: Request,
|
||||
updated_config: FullConfig,
|
||||
client: Optional[str] = None,
|
||||
):
|
||||
state.config = updated_config
|
||||
with open(state.config_file, "w") as outfile:
|
||||
yaml.dump(yaml.safe_load(state.config.json(by_alias=True)), outfile)
|
||||
outfile.close()
|
||||
|
||||
configuration_update_metadata = dict()
|
||||
|
||||
if state.config.content_type is not None:
|
||||
configuration_update_metadata["github"] = state.config.content_type.github is not None
|
||||
configuration_update_metadata["notion"] = state.config.content_type.notion is not None
|
||||
configuration_update_metadata["org"] = state.config.content_type.org is not None
|
||||
configuration_update_metadata["pdf"] = state.config.content_type.pdf is not None
|
||||
configuration_update_metadata["markdown"] = state.config.content_type.markdown is not None
|
||||
configuration_update_metadata["plugins"] = state.config.content_type.plugins is not None
|
||||
|
||||
if state.config.processor is not None:
|
||||
configuration_update_metadata["conversation_processor"] = state.config.processor.conversation is not None
|
||||
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
telemetry_type="api",
|
||||
api="set_config",
|
||||
client=client,
|
||||
metadata=configuration_update_metadata,
|
||||
)
|
||||
return state.config
|
||||
|
||||
@api.post("/config/data/content_type/github", status_code=200)
|
||||
async def set_content_config_github_data(updated_config: Union[GithubContentConfig, None]):
|
||||
async def set_content_config_github_data(
|
||||
request: Request,
|
||||
updated_config: Union[GithubContentConfig, None],
|
||||
client: Optional[str] = None,
|
||||
):
|
||||
_initialize_config()
|
||||
|
||||
if not state.config.content_type:
|
||||
@@ -72,6 +101,14 @@ if not state.demo:
|
||||
else:
|
||||
state.config.content_type.github = updated_config
|
||||
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
telemetry_type="api",
|
||||
api="set_content_config",
|
||||
client=client,
|
||||
metadata={"content_type": "github"},
|
||||
)
|
||||
|
||||
try:
|
||||
save_config_to_file_updated_state()
|
||||
return {"status": "ok"}
|
||||
@@ -79,7 +116,11 @@ if not state.demo:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
@api.post("/config/data/content_type/notion", status_code=200)
|
||||
async def set_content_config_notion_data(updated_config: Union[NotionContentConfig, None]):
|
||||
async def set_content_config_notion_data(
|
||||
request: Request,
|
||||
updated_config: Union[NotionContentConfig, None],
|
||||
client: Optional[str] = None,
|
||||
):
|
||||
_initialize_config()
|
||||
|
||||
if not state.config.content_type:
|
||||
@@ -87,6 +128,14 @@ if not state.demo:
|
||||
else:
|
||||
state.config.content_type.notion = updated_config
|
||||
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
telemetry_type="api",
|
||||
api="set_content_config",
|
||||
client=client,
|
||||
metadata={"content_type": "notion"},
|
||||
)
|
||||
|
||||
try:
|
||||
save_config_to_file_updated_state()
|
||||
return {"status": "ok"}
|
||||
@@ -94,25 +143,37 @@ if not state.demo:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
@api.post("/delete/config/data/content_type/{content_type}", status_code=200)
|
||||
async def remove_content_config_data(content_type: str):
|
||||
async def remove_content_config_data(
|
||||
request: Request,
|
||||
content_type: str,
|
||||
client: Optional[str] = None,
|
||||
):
|
||||
if not state.config or not state.config.content_type:
|
||||
return {"status": "ok"}
|
||||
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
telemetry_type="api",
|
||||
api="delete_content_config",
|
||||
client=client,
|
||||
metadata={"content_type": content_type},
|
||||
)
|
||||
|
||||
if state.config.content_type:
|
||||
state.config.content_type[content_type] = None
|
||||
|
||||
if content_type == "github":
|
||||
state.model.github_search = None
|
||||
state.content_index.github = None
|
||||
elif content_type == "notion":
|
||||
state.model.notion_search = None
|
||||
state.content_index.notion = None
|
||||
elif content_type == "plugins":
|
||||
state.model.plugin_search = None
|
||||
state.content_index.plugins = None
|
||||
elif content_type == "pdf":
|
||||
state.model.pdf_search = None
|
||||
state.content_index.pdf = None
|
||||
elif content_type == "markdown":
|
||||
state.model.markdown_search = None
|
||||
state.content_index.markdown = None
|
||||
elif content_type == "org":
|
||||
state.model.org_search = None
|
||||
state.content_index.org = None
|
||||
|
||||
try:
|
||||
save_config_to_file_updated_state()
|
||||
@@ -121,12 +182,23 @@ if not state.demo:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
@api.post("/delete/config/data/processor/conversation", status_code=200)
|
||||
async def remove_processor_conversation_config_data():
|
||||
async def remove_processor_conversation_config_data(
|
||||
request: Request,
|
||||
client: Optional[str] = None,
|
||||
):
|
||||
if not state.config or not state.config.processor or not state.config.processor.conversation:
|
||||
return {"status": "ok"}
|
||||
|
||||
state.config.processor.conversation = None
|
||||
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
telemetry_type="api",
|
||||
api="delete_processor_config",
|
||||
client=client,
|
||||
metadata={"processor_type": "conversation"},
|
||||
)
|
||||
|
||||
try:
|
||||
save_config_to_file_updated_state()
|
||||
return {"status": "ok"}
|
||||
@@ -134,7 +206,12 @@ if not state.demo:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
@api.post("/config/data/content_type/{content_type}", status_code=200)
|
||||
async def set_content_config_data(content_type: str, updated_config: Union[TextContentConfig, None]):
|
||||
async def set_content_config_data(
|
||||
request: Request,
|
||||
content_type: str,
|
||||
updated_config: Union[TextContentConfig, None],
|
||||
client: Optional[str] = None,
|
||||
):
|
||||
_initialize_config()
|
||||
|
||||
if not state.config.content_type:
|
||||
@@ -142,6 +219,14 @@ if not state.demo:
|
||||
else:
|
||||
state.config.content_type[content_type] = updated_config
|
||||
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
telemetry_type="api",
|
||||
api="set_content_config",
|
||||
client=client,
|
||||
metadata={"content_type": content_type},
|
||||
)
|
||||
|
||||
try:
|
||||
save_config_to_file_updated_state()
|
||||
return {"status": "ok"}
|
||||
@@ -149,11 +234,24 @@ if not state.demo:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
@api.post("/config/data/processor/conversation", status_code=200)
|
||||
async def set_processor_conversation_config_data(updated_config: Union[ConversationProcessorConfig, None]):
|
||||
async def set_processor_conversation_config_data(
|
||||
request: Request,
|
||||
updated_config: Union[ConversationProcessorConfig, None],
|
||||
client: Optional[str] = None,
|
||||
):
|
||||
_initialize_config()
|
||||
|
||||
state.config.processor = ProcessorConfig(conversation=updated_config)
|
||||
state.processor_config = configure_processor(state.config.processor)
|
||||
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
telemetry_type="api",
|
||||
api="set_content_config",
|
||||
client=client,
|
||||
metadata={"processor_type": "conversation"},
|
||||
)
|
||||
|
||||
try:
|
||||
save_config_to_file_updated_state()
|
||||
return {"status": "ok"}
|
||||
@@ -182,7 +280,7 @@ def get_config_types():
|
||||
for search_type in SearchType
|
||||
if (
|
||||
search_type.value in configured_content_types
|
||||
and getattr(state.model, f"{search_type.value}_search") is not None
|
||||
and getattr(state.content_index, search_type.value) is not None
|
||||
)
|
||||
or ("plugins" in configured_content_types and search_type.name in configured_content_types["plugins"])
|
||||
or search_type == SearchType.All
|
||||
@@ -210,7 +308,7 @@ async def search(
|
||||
if q is None or q == "":
|
||||
logger.warning(f"No query param (q) passed in API call to initiate search")
|
||||
return results
|
||||
if not state.model or not any(state.model.__dict__.values()):
|
||||
if not state.search_models or not any(state.search_models.__dict__.values()):
|
||||
logger.warning(f"No search models loaded. Configure a search model before initiating search")
|
||||
return results
|
||||
|
||||
@@ -234,7 +332,7 @@ async def search(
|
||||
encoded_asymmetric_query = None
|
||||
if t == SearchType.All or t != SearchType.Image:
|
||||
text_search_models: List[TextSearchModel] = [
|
||||
model for model in state.model.__dict__.values() if isinstance(model, TextSearchModel)
|
||||
model for model in state.search_models.__dict__.values() if isinstance(model, TextSearchModel)
|
||||
]
|
||||
if text_search_models:
|
||||
with timer("Encoding query took", logger=logger):
|
||||
@@ -247,13 +345,14 @@ async def search(
|
||||
)
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
if (t == SearchType.Org or t == SearchType.All) and state.model.org_search:
|
||||
if (t == SearchType.Org or t == SearchType.All) and state.content_index.org and state.search_models.text_search:
|
||||
# query org-mode notes
|
||||
search_futures += [
|
||||
executor.submit(
|
||||
text_search.query,
|
||||
user_query,
|
||||
state.model.org_search,
|
||||
state.search_models.text_search,
|
||||
state.content_index.org,
|
||||
question_embedding=encoded_asymmetric_query,
|
||||
rank_results=r or False,
|
||||
score_threshold=score_threshold,
|
||||
@@ -261,13 +360,18 @@ async def search(
|
||||
)
|
||||
]
|
||||
|
||||
if (t == SearchType.Markdown or t == SearchType.All) and state.model.markdown_search:
|
||||
if (
|
||||
(t == SearchType.Markdown or t == SearchType.All)
|
||||
and state.content_index.markdown
|
||||
and state.search_models.text_search
|
||||
):
|
||||
# query markdown notes
|
||||
search_futures += [
|
||||
executor.submit(
|
||||
text_search.query,
|
||||
user_query,
|
||||
state.model.markdown_search,
|
||||
state.search_models.text_search,
|
||||
state.content_index.markdown,
|
||||
question_embedding=encoded_asymmetric_query,
|
||||
rank_results=r or False,
|
||||
score_threshold=score_threshold,
|
||||
@@ -275,13 +379,18 @@ async def search(
|
||||
)
|
||||
]
|
||||
|
||||
if (t == SearchType.Github or t == SearchType.All) and state.model.github_search:
|
||||
if (
|
||||
(t == SearchType.Github or t == SearchType.All)
|
||||
and state.content_index.github
|
||||
and state.search_models.text_search
|
||||
):
|
||||
# query github issues
|
||||
search_futures += [
|
||||
executor.submit(
|
||||
text_search.query,
|
||||
user_query,
|
||||
state.model.github_search,
|
||||
state.search_models.text_search,
|
||||
state.content_index.github,
|
||||
question_embedding=encoded_asymmetric_query,
|
||||
rank_results=r or False,
|
||||
score_threshold=score_threshold,
|
||||
@@ -289,13 +398,14 @@ async def search(
|
||||
)
|
||||
]
|
||||
|
||||
if (t == SearchType.Pdf or t == SearchType.All) and state.model.pdf_search:
|
||||
if (t == SearchType.Pdf or t == SearchType.All) and state.content_index.pdf and state.search_models.text_search:
|
||||
# query pdf files
|
||||
search_futures += [
|
||||
executor.submit(
|
||||
text_search.query,
|
||||
user_query,
|
||||
state.model.pdf_search,
|
||||
state.search_models.text_search,
|
||||
state.content_index.pdf,
|
||||
question_embedding=encoded_asymmetric_query,
|
||||
rank_results=r or False,
|
||||
score_threshold=score_threshold,
|
||||
@@ -303,26 +413,38 @@ async def search(
|
||||
)
|
||||
]
|
||||
|
||||
if (t == SearchType.Image) and state.model.image_search:
|
||||
if (t == SearchType.Image) and state.content_index.image and state.search_models.image_search:
|
||||
# query images
|
||||
search_futures += [
|
||||
executor.submit(
|
||||
image_search.query,
|
||||
user_query,
|
||||
results_count,
|
||||
state.model.image_search,
|
||||
state.search_models.image_search,
|
||||
state.content_index.image,
|
||||
score_threshold=score_threshold,
|
||||
)
|
||||
]
|
||||
|
||||
if (t == SearchType.All or t in SearchType) and state.model.plugin_search:
|
||||
if (
|
||||
(t == SearchType.All or t in SearchType)
|
||||
and state.content_index.plugins
|
||||
and state.search_models.plugin_search
|
||||
):
|
||||
# query specified plugin type
|
||||
# Get plugin content, search model for specified search type, or the first one if none specified
|
||||
plugin_search = state.search_models.plugin_search.get(t.value) or next(
|
||||
iter(state.search_models.plugin_search.values())
|
||||
)
|
||||
plugin_content = state.content_index.plugins.get(t.value) or next(
|
||||
iter(state.content_index.plugins.values())
|
||||
)
|
||||
search_futures += [
|
||||
executor.submit(
|
||||
text_search.query,
|
||||
user_query,
|
||||
# Get plugin search model for specified search type, or the first one if none specified
|
||||
state.model.plugin_search.get(t.value) or next(iter(state.model.plugin_search.values())),
|
||||
plugin_search,
|
||||
plugin_content,
|
||||
question_embedding=encoded_asymmetric_query,
|
||||
rank_results=r or False,
|
||||
score_threshold=score_threshold,
|
||||
@@ -330,13 +452,18 @@ async def search(
|
||||
)
|
||||
]
|
||||
|
||||
if (t == SearchType.Notion or t == SearchType.All) and state.model.notion_search:
|
||||
if (
|
||||
(t == SearchType.Notion or t == SearchType.All)
|
||||
and state.content_index.notion
|
||||
and state.search_models.text_search
|
||||
):
|
||||
# query notion pages
|
||||
search_futures += [
|
||||
executor.submit(
|
||||
text_search.query,
|
||||
user_query,
|
||||
state.model.notion_search,
|
||||
state.search_models.text_search,
|
||||
state.content_index.notion,
|
||||
question_embedding=encoded_asymmetric_query,
|
||||
rank_results=r or False,
|
||||
score_threshold=score_threshold,
|
||||
@@ -347,13 +474,13 @@ async def search(
|
||||
# Query across each requested content types in parallel
|
||||
with timer("Query took", logger):
|
||||
for search_future in concurrent.futures.as_completed(search_futures):
|
||||
if t == SearchType.Image:
|
||||
if t == SearchType.Image and state.content_index.image:
|
||||
hits = await search_future.result()
|
||||
output_directory = constants.web_directory / "images"
|
||||
# Collate results
|
||||
results += image_search.collate_results(
|
||||
hits,
|
||||
image_names=state.model.image_search.image_names,
|
||||
image_names=state.content_index.image.image_names,
|
||||
output_directory=output_directory,
|
||||
image_files_url="/static/images",
|
||||
count=results_count,
|
||||
@@ -369,20 +496,16 @@ async def search(
|
||||
# Cache results
|
||||
state.query_cache[query_cache_key] = results
|
||||
|
||||
user_state = {
|
||||
"client_host": request.client.host if request.client else "unknown",
|
||||
"user_agent": user_agent or "unknown",
|
||||
"referer": referer or "unknown",
|
||||
"host": host or "unknown",
|
||||
}
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
telemetry_type="api",
|
||||
api="search",
|
||||
client=client,
|
||||
user_agent=user_agent,
|
||||
referer=referer,
|
||||
host=host,
|
||||
)
|
||||
|
||||
# Only log telemetry if query is new and not a continuation of previous query
|
||||
if state.previous_query is None or state.previous_query not in user_query:
|
||||
state.telemetry += [
|
||||
log_telemetry(
|
||||
telemetry_type="api", api="search", client=client, app_config=state.config.app, properties=user_state
|
||||
)
|
||||
]
|
||||
state.previous_query = user_query
|
||||
|
||||
end_time = time.time()
|
||||
@@ -401,42 +524,36 @@ def update(
|
||||
referer: Optional[str] = Header(None),
|
||||
host: Optional[str] = Header(None),
|
||||
):
|
||||
if not state.config:
|
||||
error_msg = f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/config, plugins or by editing {state.config_file}."
|
||||
logger.warning(error_msg)
|
||||
raise HTTPException(status_code=500, detail=error_msg)
|
||||
try:
|
||||
state.search_index_lock.acquire()
|
||||
try:
|
||||
state.model = configure_search(state.model, state.config, regenerate=force or False, t=t)
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
state.search_index_lock.release()
|
||||
except ValueError as e:
|
||||
logger.error(e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
configure_server(state.config, regenerate=force or False, search_type=t)
|
||||
except Exception as e:
|
||||
error_msg = f"🚨 Failed to update server via API: {e}"
|
||||
logger.error(error_msg, exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=error_msg)
|
||||
else:
|
||||
logger.info("📬 Search index updated via API")
|
||||
components = []
|
||||
if state.search_models:
|
||||
components.append("Search models")
|
||||
if state.content_index:
|
||||
components.append("Content index")
|
||||
if state.processor_config:
|
||||
components.append("Conversation processor")
|
||||
components_msg = ", ".join(components)
|
||||
logger.info(f"📬 {components_msg} updated via API")
|
||||
|
||||
try:
|
||||
if state.config and state.config.processor:
|
||||
state.processor_config = configure_processor(state.config.processor)
|
||||
except ValueError as e:
|
||||
logger.error(e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
else:
|
||||
logger.info("📬 Processor reconfigured via API")
|
||||
|
||||
user_state = {
|
||||
"client_host": request.client.host if request.client else None,
|
||||
"user_agent": user_agent or "unknown",
|
||||
"referer": referer or "unknown",
|
||||
"host": host or "unknown",
|
||||
}
|
||||
|
||||
state.telemetry += [
|
||||
log_telemetry(
|
||||
telemetry_type="api", api="update", client=client, app_config=state.config.app, properties=user_state
|
||||
)
|
||||
]
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
telemetry_type="api",
|
||||
api="update",
|
||||
client=client,
|
||||
user_agent=user_agent,
|
||||
referer=referer,
|
||||
host=host,
|
||||
)
|
||||
|
||||
return {"status": "ok", "message": "khoj reloaded"}
|
||||
|
||||
@@ -454,18 +571,15 @@ def chat_history(
|
||||
# Load Conversation History
|
||||
meta_log = state.processor_config.conversation.meta_log
|
||||
|
||||
user_state = {
|
||||
"client_host": request.client.host if request.client else None,
|
||||
"user_agent": user_agent or "unknown",
|
||||
"referer": referer or "unknown",
|
||||
"host": host or "unknown",
|
||||
}
|
||||
|
||||
state.telemetry += [
|
||||
log_telemetry(
|
||||
telemetry_type="api", api="chat", client=client, app_config=state.config.app, properties=user_state
|
||||
)
|
||||
]
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
telemetry_type="api",
|
||||
api="chat",
|
||||
client=client,
|
||||
user_agent=user_agent,
|
||||
referer=referer,
|
||||
host=host,
|
||||
)
|
||||
|
||||
return {"status": "ok", "response": meta_log.get("chat", [])}
|
||||
|
||||
@@ -509,18 +623,15 @@ async def chat(
|
||||
|
||||
response_obj = {"response": actual_response, "context": compiled_references}
|
||||
|
||||
user_state = {
|
||||
"client_host": request.client.host if request.client else None,
|
||||
"user_agent": user_agent or "unknown",
|
||||
"referer": referer or "unknown",
|
||||
"host": host or "unknown",
|
||||
}
|
||||
|
||||
state.telemetry += [
|
||||
log_telemetry(
|
||||
telemetry_type="api", api="chat", client=client, app_config=state.config.app, properties=user_state
|
||||
)
|
||||
]
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
telemetry_type="api",
|
||||
api="chat",
|
||||
client=client,
|
||||
user_agent=user_agent,
|
||||
referer=referer,
|
||||
host=host,
|
||||
)
|
||||
|
||||
return Response(content=json.dumps(response_obj), media_type="application/json", status_code=200)
|
||||
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
from fastapi import HTTPException
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from functools import partial
|
||||
from typing import List
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import HTTPException, Request
|
||||
|
||||
from khoj.utils import state
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.helpers import timer, log_telemetry
|
||||
from khoj.processor.conversation.gpt import converse
|
||||
from khoj.processor.conversation.utils import message_to_log, reciprocal_conversation_to_chatml
|
||||
|
||||
@@ -24,6 +25,33 @@ def perform_chat_checks():
|
||||
)
|
||||
|
||||
|
||||
def update_telemetry_state(
|
||||
request: Request,
|
||||
telemetry_type: str,
|
||||
api: str,
|
||||
client: Optional[str] = None,
|
||||
user_agent: Optional[str] = None,
|
||||
referer: Optional[str] = None,
|
||||
host: Optional[str] = None,
|
||||
metadata: Optional[dict] = None,
|
||||
):
|
||||
user_state = {
|
||||
"client_host": request.client.host if request.client else None,
|
||||
"user_agent": user_agent or "unknown",
|
||||
"referer": referer or "unknown",
|
||||
"host": host or "unknown",
|
||||
}
|
||||
|
||||
if metadata:
|
||||
user_state.update(metadata)
|
||||
|
||||
state.telemetry += [
|
||||
log_telemetry(
|
||||
telemetry_type=telemetry_type, api=api, client=client, app_config=state.config.app, properties=user_state
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
def generate_chat_response(
|
||||
q: str,
|
||||
meta_log: dict,
|
||||
|
||||
@@ -39,7 +39,44 @@ if not state.demo:
|
||||
processor=None,
|
||||
)
|
||||
current_config = state.config or json.loads(default_full_config.json())
|
||||
return templates.TemplateResponse("config.html", context={"request": request, "current_config": current_config})
|
||||
|
||||
successfully_configured = {
|
||||
"pdf": False,
|
||||
"markdown": False,
|
||||
"org": False,
|
||||
"image": False,
|
||||
"github": False,
|
||||
"notion": False,
|
||||
"conversation": False,
|
||||
}
|
||||
|
||||
if state.content_index:
|
||||
successfully_configured.update(
|
||||
{
|
||||
"pdf": state.content_index.pdf is not None,
|
||||
"markdown": state.content_index.markdown is not None,
|
||||
"org": state.content_index.org is not None,
|
||||
"image": state.content_index.image is not None,
|
||||
"github": state.content_index.github is not None,
|
||||
"notion": state.content_index.notion is not None,
|
||||
}
|
||||
)
|
||||
|
||||
if state.processor_config:
|
||||
successfully_configured.update(
|
||||
{
|
||||
"conversation": state.processor_config.conversation is not None,
|
||||
}
|
||||
)
|
||||
|
||||
return templates.TemplateResponse(
|
||||
"config.html",
|
||||
context={
|
||||
"request": request,
|
||||
"current_config": current_config,
|
||||
"current_model_state": successfully_configured,
|
||||
},
|
||||
)
|
||||
|
||||
@web_client.get("/config/content_type/github", response_class=HTMLResponse)
|
||||
def github_config_page(request: Request):
|
||||
|
||||
@@ -12,10 +12,12 @@ from sentence_transformers import SentenceTransformer, util
|
||||
from PIL import Image
|
||||
from tqdm import trange
|
||||
import torch
|
||||
from khoj.utils import state
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils.helpers import get_absolute_path, get_from_dict, resolve_absolute_path, load_model, timer
|
||||
from khoj.utils.config import ImageSearchModel
|
||||
from khoj.utils.config import ImageContent, ImageSearchModel
|
||||
from khoj.utils.models import BaseEncoder
|
||||
from khoj.utils.rawconfig import ImageContentConfig, ImageSearchConfig, SearchResponse
|
||||
|
||||
|
||||
@@ -40,7 +42,7 @@ def initialize_model(search_config: ImageSearchConfig):
|
||||
model_type=search_config.encoder_type or SentenceTransformer,
|
||||
)
|
||||
|
||||
return encoder
|
||||
return ImageSearchModel(encoder)
|
||||
|
||||
|
||||
def extract_entries(image_directories):
|
||||
@@ -143,7 +145,9 @@ def extract_metadata(image_name):
|
||||
return image_processed_metadata
|
||||
|
||||
|
||||
async def query(raw_query, count, model: ImageSearchModel, score_threshold: float = -math.inf):
|
||||
async def query(
|
||||
raw_query, count, search_model: ImageSearchModel, content: ImageContent, score_threshold: float = -math.inf
|
||||
):
|
||||
# Set query to image content if query is of form file:/path/to/file.png
|
||||
if raw_query.startswith("file:") and pathlib.Path(raw_query[5:]).is_file():
|
||||
query_imagepath = resolve_absolute_path(pathlib.Path(raw_query[5:]), strict=True)
|
||||
@@ -158,21 +162,21 @@ async def query(raw_query, count, model: ImageSearchModel, score_threshold: floa
|
||||
|
||||
# Now we encode the query (which can either be an image or a text string)
|
||||
with timer("Query Encode Time", logger):
|
||||
query_embedding = model.image_encoder.encode([query], convert_to_tensor=True, show_progress_bar=False)
|
||||
query_embedding = search_model.image_encoder.encode([query], convert_to_tensor=True, show_progress_bar=False)
|
||||
|
||||
# Compute top_k ranked images based on cosine-similarity b/w query and all image embeddings.
|
||||
with timer("Search Time", logger):
|
||||
image_hits = {
|
||||
result["corpus_id"]: {"image_score": result["score"], "score": result["score"]}
|
||||
for result in util.semantic_search(query_embedding, model.image_embeddings, top_k=count)[0]
|
||||
for result in util.semantic_search(query_embedding, content.image_embeddings, top_k=count)[0]
|
||||
}
|
||||
|
||||
# Compute top_k ranked images based on cosine-similarity b/w query and all image metadata embeddings.
|
||||
if model.image_metadata_embeddings:
|
||||
if content.image_metadata_embeddings:
|
||||
with timer("Metadata Search Time", logger):
|
||||
metadata_hits = {
|
||||
result["corpus_id"]: result["score"]
|
||||
for result in util.semantic_search(query_embedding, model.image_metadata_embeddings, top_k=count)[0]
|
||||
for result in util.semantic_search(query_embedding, content.image_metadata_embeddings, top_k=count)[0]
|
||||
}
|
||||
|
||||
# Sum metadata, image scores of the highest ranked images
|
||||
@@ -239,10 +243,7 @@ def collate_results(hits, image_names, output_directory, image_files_url, count=
|
||||
return results
|
||||
|
||||
|
||||
def setup(config: ImageContentConfig, search_config: ImageSearchConfig, regenerate: bool) -> ImageSearchModel:
|
||||
# Initialize Model
|
||||
encoder = initialize_model(search_config)
|
||||
|
||||
def setup(config: ImageContentConfig, encoder: BaseEncoder, regenerate: bool) -> ImageContent:
|
||||
# Extract Entries
|
||||
absolute_image_files, filtered_image_files = set(), set()
|
||||
if config.input_directories:
|
||||
@@ -268,4 +269,4 @@ def setup(config: ImageContentConfig, search_config: ImageSearchConfig, regenera
|
||||
use_xmp_metadata=config.use_xmp_metadata,
|
||||
)
|
||||
|
||||
return ImageSearchModel(all_image_files, image_embeddings, image_metadata_embeddings, encoder)
|
||||
return ImageContent(all_image_files, image_embeddings, image_metadata_embeddings)
|
||||
|
||||
@@ -13,7 +13,7 @@ from khoj.search_filter.base_filter import BaseFilter
|
||||
# Internal Packages
|
||||
from khoj.utils import state
|
||||
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, resolve_absolute_path, load_model, timer
|
||||
from khoj.utils.config import TextSearchModel
|
||||
from khoj.utils.config import TextContent, TextSearchModel
|
||||
from khoj.utils.models import BaseEncoder
|
||||
from khoj.utils.rawconfig import SearchResponse, TextSearchConfig, TextConfigBase, Entry
|
||||
from khoj.utils.jsonl import load_jsonl
|
||||
@@ -26,9 +26,6 @@ def initialize_model(search_config: TextSearchConfig):
|
||||
"Initialize model for semantic search on text"
|
||||
torch.set_num_threads(4)
|
||||
|
||||
# Number of entries we want to retrieve with the bi-encoder
|
||||
top_k = 15
|
||||
|
||||
# If model directory is configured
|
||||
if search_config.model_directory:
|
||||
# Convert model directory to absolute path
|
||||
@@ -52,7 +49,7 @@ def initialize_model(search_config: TextSearchConfig):
|
||||
device=f"{state.device}",
|
||||
)
|
||||
|
||||
return bi_encoder, cross_encoder, top_k
|
||||
return TextSearchModel(bi_encoder, cross_encoder)
|
||||
|
||||
|
||||
def extract_entries(jsonl_file) -> List[Entry]:
|
||||
@@ -61,60 +58,66 @@ def extract_entries(jsonl_file) -> List[Entry]:
|
||||
|
||||
|
||||
def compute_embeddings(
|
||||
entries_with_ids: List[Tuple[int, Entry]], bi_encoder: BaseEncoder, embeddings_file: Path, regenerate=False
|
||||
entries_with_ids: List[Tuple[int, Entry]],
|
||||
bi_encoder: BaseEncoder,
|
||||
embeddings_file: Path,
|
||||
regenerate=False,
|
||||
normalize=True,
|
||||
):
|
||||
"Compute (and Save) Embeddings or Load Pre-Computed Embeddings"
|
||||
new_entries = []
|
||||
new_embeddings = torch.tensor([], device=state.device)
|
||||
existing_embeddings = torch.tensor([], device=state.device)
|
||||
create_index_msg = ""
|
||||
# Load pre-computed embeddings from file if exists and update them if required
|
||||
if embeddings_file.exists() and not regenerate:
|
||||
corpus_embeddings = torch.load(get_absolute_path(embeddings_file), map_location=state.device)
|
||||
corpus_embeddings: torch.Tensor = torch.load(get_absolute_path(embeddings_file), map_location=state.device)
|
||||
logger.debug(f"Loaded {len(corpus_embeddings)} text embeddings from {embeddings_file}")
|
||||
|
||||
# Encode any new entries in the corpus and update corpus embeddings
|
||||
new_entries = [entry.compiled for id, entry in entries_with_ids if id == -1]
|
||||
if new_entries:
|
||||
logger.info(f"📩 Indexing {len(new_entries)} text entries.")
|
||||
new_embeddings = bi_encoder.encode(
|
||||
new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True
|
||||
)
|
||||
existing_entry_ids = [id for id, _ in entries_with_ids if id != -1]
|
||||
if existing_entry_ids:
|
||||
existing_embeddings = torch.index_select(
|
||||
corpus_embeddings, 0, torch.tensor(existing_entry_ids, device=state.device)
|
||||
)
|
||||
else:
|
||||
existing_embeddings = torch.tensor([], device=state.device)
|
||||
corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0)
|
||||
# Else compute the corpus embeddings from scratch
|
||||
else:
|
||||
new_entries = [entry.compiled for _, entry in entries_with_ids]
|
||||
logger.info(f"📩 Indexing {len(new_entries)} text entries. Creating index from scratch.")
|
||||
corpus_embeddings = bi_encoder.encode(
|
||||
corpus_embeddings = torch.tensor([], device=state.device)
|
||||
create_index_msg = " Creating index from scratch."
|
||||
|
||||
# Encode any new entries in the corpus and update corpus embeddings
|
||||
new_entries = [entry.compiled for id, entry in entries_with_ids if id == -1]
|
||||
if new_entries:
|
||||
logger.info(f"📩 Indexing {len(new_entries)} text entries.{create_index_msg}")
|
||||
new_embeddings = bi_encoder.encode(
|
||||
new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True
|
||||
)
|
||||
|
||||
# Save regenerated or updated embeddings to file
|
||||
if new_entries:
|
||||
# Extract existing embeddings from previous corpus embeddings
|
||||
existing_entry_ids = [id for id, _ in entries_with_ids if id != -1]
|
||||
if existing_entry_ids:
|
||||
existing_embeddings = torch.index_select(
|
||||
corpus_embeddings, 0, torch.tensor(existing_entry_ids, device=state.device)
|
||||
)
|
||||
|
||||
# Set corpus embeddings to merger of existing and new embeddings
|
||||
corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0)
|
||||
if normalize:
|
||||
# Normalize embeddings for faster lookup via dot product when querying
|
||||
corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
|
||||
torch.save(corpus_embeddings, embeddings_file)
|
||||
logger.info(f"📩 Saved computed text embeddings to {embeddings_file}")
|
||||
|
||||
# Save regenerated or updated embeddings to file
|
||||
torch.save(corpus_embeddings, embeddings_file)
|
||||
logger.info(f"📩 Saved computed text embeddings to {embeddings_file}")
|
||||
|
||||
return corpus_embeddings
|
||||
|
||||
|
||||
async def query(
raw_query: str,
model: TextSearchModel,
search_model: TextSearchModel,
content: TextContent,
question_embedding: Union[torch.Tensor, None] = None,
rank_results: bool = False,
score_threshold: float = -math.inf,
dedupe: bool = True,
) -> Tuple[List[dict], List[Entry]]:
"Search for entries that answer the query"
query, entries, corpus_embeddings = raw_query, model.entries, model.corpus_embeddings
query, entries, corpus_embeddings = raw_query, content.entries, content.corpus_embeddings

# Filter query, entries and embeddings before semantic search
query, entries, corpus_embeddings = apply_filters(query, entries, corpus_embeddings, model.filters)
query, entries, corpus_embeddings = apply_filters(query, entries, corpus_embeddings, content.filters)

# If no entries left after filtering, return empty results
if entries is None or len(entries) == 0:
@@ -127,18 +130,17 @@ async def query(
# Encode the query using the bi-encoder
if question_embedding is None:
with timer("Query Encode Time", logger, state.device):
question_embedding = model.bi_encoder.encode([query], convert_to_tensor=True, device=state.device)
question_embedding = search_model.bi_encoder.encode([query], convert_to_tensor=True, device=state.device)
question_embedding = util.normalize_embeddings(question_embedding)

# Find relevant entries for the query
top_k = min(len(entries), search_model.top_k or 10) # top_k hits can't be more than the total entries in corpus
with timer("Search Time", logger, state.device):
hits = util.semantic_search(
question_embedding, corpus_embeddings, top_k=model.top_k, score_function=util.dot_score
)[0]
hits = util.semantic_search(question_embedding, corpus_embeddings, top_k, score_function=util.dot_score)[0]

# Score all retrieved entries using the cross-encoder
if rank_results:
hits = cross_encoder_score(model.cross_encoder, query, entries, hits)
if rank_results and search_model.cross_encoder:
hits = cross_encoder_score(search_model.cross_encoder, query, entries, hits)

# Filter results by score threshold
hits = [hit for hit in hits if hit.get("cross-score", hit.get("score")) >= score_threshold]
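For orientation, this is roughly how a caller drives the updated query() signature, mirroring the call sites in the test changes further down this diff; it assumes the search model and content index were populated by setup() beforehand.

# --- Illustrative usage sketch, not part of the diff ---
from khoj.search_type import text_search
from khoj.utils import state

async def search_org_notes(user_query: str):
    # Encoders come from state.search_models, indexed entries and
    # embeddings come from state.content_index (assumed already set up)
    hits, entries = await text_search.query(
        user_query,
        search_model=state.search_models.text_search,
        content=state.content_index.org,
        rank_results=True,
    )
    return text_search.collate_results(hits, entries, count=5)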
@@ -173,37 +175,34 @@ def collate_results(hits, entries: List[Entry], count=5) -> List[SearchResponse]
def setup(
text_to_jsonl: Type[TextToJsonl],
config: TextConfigBase,
search_config: TextSearchConfig,
bi_encoder: BaseEncoder,
regenerate: bool,
filters: List[BaseFilter] = [],
) -> TextSearchModel:
# Initialize Model
bi_encoder, cross_encoder, top_k = initialize_model(search_config)

normalize: bool = True,
) -> TextContent:
# Map notes in text files to (compressed) JSONL formatted file
config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl)
previous_entries = (
extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() and not regenerate else None
)
entries_with_indices = text_to_jsonl(config).process(previous_entries or [])
previous_entries = []
if config.compressed_jsonl.exists() and not regenerate:
previous_entries = extract_entries(config.compressed_jsonl)
entries_with_indices = text_to_jsonl(config).process(previous_entries)

# Extract Updated Entries
entries = extract_entries(config.compressed_jsonl)
if is_none_or_empty(entries):
config_params = ", ".join([f"{key}={value}" for key, value in config.dict().items()])
raise ValueError(f"No valid entries found in specified files: {config_params}")
top_k = min(len(entries), top_k) # top_k hits can't be more than the total entries in corpus

# Compute or Load Embeddings
config.embeddings_file = resolve_absolute_path(config.embeddings_file)
corpus_embeddings = compute_embeddings(
entries_with_indices, bi_encoder, config.embeddings_file, regenerate=regenerate
entries_with_indices, bi_encoder, config.embeddings_file, regenerate=regenerate, normalize=normalize
)

for filter in filters:
filter.load(entries, regenerate=regenerate)

return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, filters, top_k)
return TextContent(entries, corpus_embeddings, filters)


def apply_filters(

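As a usage sketch of the reworked setup() above: encoders are initialized once per search type, then each content type is indexed into a TextContent with the shared bi-encoder. This mirrors the test fixtures later in this diff; the config objects are assumed to be loaded from the user's Khoj config.

# --- Illustrative usage sketch, not part of the diff ---
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.search_type import text_search
from khoj.utils import state

def configure_text_search(content_config, search_config):
    # Encoders live in SearchModels, indexed content lives in ContentIndex
    state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
    state.content_index.org = text_search.setup(
        OrgToJsonl,
        content_config.org,
        state.search_models.text_search.bi_encoder,
        regenerate=False,
    )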
@@ -5,7 +5,7 @@ from importlib.metadata import version

# Internal Packages
from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.yaml import parse_config_from_file
from khoj.utils.yaml import load_config_from_file, parse_config_from_file, save_config_to_file


def cli(args=None):
@@ -23,7 +23,7 @@ def cli(args=None):
)
parser.add_argument("--verbose", "-v", action="count", default=0, help="Show verbose conversion logs. Default: 0")
parser.add_argument("--host", type=str, default="127.0.0.1", help="Host address of the server. Default: 127.0.0.1")
parser.add_argument("--port", "-p", type=int, default=8000, help="Port of the server. Default: 8000")
parser.add_argument("--port", "-p", type=int, default=42110, help="Port of the server. Default: 42110")
parser.add_argument(
"--socket",
type=pathlib.Path,
@@ -34,9 +34,10 @@ def cli(args=None):

args = parser.parse_args(args)

args.version_no = version("khoj-assistant")
if args.version:
# Show version of khoj installed and exit
print(version("khoj-assistant"))
print(args.version_no)
exit(0)

# Normalize config_file path to absolute path
@@ -45,6 +46,22 @@ def cli(args=None):
if not args.config_file.exists():
args.config = None
else:
args = migrate_config(args)
args.config = parse_config_from_file(args.config_file)

return args


def migrate_config(args):
raw_config = load_config_from_file(args.config_file)

# Add version to khoj config schema
if "version" not in raw_config:
raw_config["version"] = args.version_no
save_config_to_file(raw_config, args.config_file)

# regenerate khoj index on first start of this version
# this should refresh index and apply index corruption fixes from #325
args.regenerate = True

return args

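A small usage sketch of the version-aware CLI parsing added above. The import path and the config file location are assumptions; the flags match the argparse definitions in this hunk.

# --- Illustrative usage sketch, not part of the diff ---
from khoj.utils.cli import cli  # import path assumed

# cli() stamps args.version_no with the installed khoj-assistant version and,
# via migrate_config(), adds a "version" key to an older khoj.yml on disk
args = cli(["--config-file", "~/.khoj/khoj.yml", "--port", "42110"])
print(args.version_no)  # e.g. "0.9.0"
print(args.regenerate)  # True when the config predates versioning, forcing a one-time reindex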
@@ -3,7 +3,7 @@ from __future__ import annotations # to avoid quoting type hints
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Dict, List, Union
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||
|
||||
# External Packages
|
||||
import torch
|
||||
@@ -30,42 +30,48 @@ class ProcessorType(str, Enum):
|
||||
Conversation = "conversation"
|
||||
|
||||
|
||||
@dataclass
|
||||
class TextContent:
|
||||
entries: List[Entry]
|
||||
corpus_embeddings: torch.Tensor
|
||||
filters: List[BaseFilter]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ImageContent:
|
||||
image_names: List[str]
|
||||
image_embeddings: torch.Tensor
|
||||
image_metadata_embeddings: torch.Tensor
|
||||
|
||||
|
||||
@dataclass
|
||||
class TextSearchModel:
|
||||
def __init__(
|
||||
self,
|
||||
entries: List[Entry],
|
||||
corpus_embeddings: torch.Tensor,
|
||||
bi_encoder: BaseEncoder,
|
||||
cross_encoder: CrossEncoder,
|
||||
filters: List[BaseFilter],
|
||||
top_k,
|
||||
):
|
||||
self.entries = entries
|
||||
self.corpus_embeddings = corpus_embeddings
|
||||
self.bi_encoder = bi_encoder
|
||||
self.cross_encoder = cross_encoder
|
||||
self.filters = filters
|
||||
self.top_k = top_k
|
||||
bi_encoder: BaseEncoder
|
||||
cross_encoder: Optional[CrossEncoder] = None
|
||||
top_k: Optional[int] = 15
|
||||
|
||||
|
||||
@dataclass
|
||||
class ImageSearchModel:
|
||||
def __init__(self, image_names, image_embeddings, image_metadata_embeddings, image_encoder: BaseEncoder):
|
||||
self.image_encoder = image_encoder
|
||||
self.image_names = image_names
|
||||
self.image_embeddings = image_embeddings
|
||||
self.image_metadata_embeddings = image_metadata_embeddings
|
||||
self.image_encoder = image_encoder
|
||||
image_encoder: BaseEncoder
|
||||
|
||||
|
||||
@dataclass
class ContentIndex:
org: Optional[TextContent] = None
markdown: Optional[TextContent] = None
pdf: Optional[TextContent] = None
github: Optional[TextContent] = None
notion: Optional[TextContent] = None
image: Optional[ImageContent] = None
plugins: Optional[Dict[str, TextContent]] = None


@dataclass
class SearchModels:
org_search: Union[TextSearchModel, None] = None
markdown_search: Union[TextSearchModel, None] = None
pdf_search: Union[TextSearchModel, None] = None
image_search: Union[ImageSearchModel, None] = None
github_search: Union[TextSearchModel, None] = None
notion_search: Union[TextSearchModel, None] = None
plugin_search: Union[Dict[str, TextSearchModel], None] = None
text_search: Optional[TextSearchModel] = None
image_search: Optional[ImageSearchModel] = None
plugin_search: Optional[Dict[str, TextSearchModel]] = None


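The two dataclasses above are the seam of this refactor: SearchModels now only carries loaded encoders, while ContentIndex carries the indexed entries and embeddings, one optional slot per content type. A tiny sketch of how they start out:

# --- Illustrative example, not part of the diff ---
from khoj.utils.config import ContentIndex, SearchModels

content_index = ContentIndex()
search_models = SearchModels()

# Every slot defaults to None, so state can be populated lazily per content type
assert content_index.org is None and search_models.text_search is None

# A feature is available only once both halves are set up
if search_models.text_search and content_index.org:
    pass  # text search over org notes can be served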
class ConversationProcessorConfigModel:
|
||||
|
||||
@@ -20,7 +20,7 @@ from khoj.utils import constants

if TYPE_CHECKING:
# External Packages
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer, CrossEncoder

# Internal Packages
from khoj.utils.models import BaseEncoder
@@ -64,7 +64,9 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
return merged_dict


def load_model(model_name: str, model_type, model_dir=None, device: str = None) -> Union[BaseEncoder, CrossEncoder]:
def load_model(
model_name: str, model_type, model_dir=None, device: str = None
) -> Union[BaseEncoder, SentenceTransformer, CrossEncoder]:
"Load model from disk or huggingface"
# Construct model path
logger = logging.getLogger(__name__)

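A hedged sketch of calling the widened load_model() helper; the import path, cache directory and model name are assumptions (the model name matches the asymmetric encoder configured in the test fixtures below).

# --- Illustrative usage sketch, not part of the diff ---
from pathlib import Path
from sentence_transformers import SentenceTransformer

from khoj.utils.helpers import load_model  # import path assumed

bi_encoder = load_model(
    model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
    model_type=SentenceTransformer,
    model_dir=Path("~/.cache/khoj/search/asymmetric").expanduser(),
    device="cpu",
)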
@@ -20,7 +20,7 @@ def load_jsonl(input_path):
|
||||
# Open JSONL file
|
||||
if input_path.suffix == ".gz":
|
||||
jsonl_file = gzip.open(get_absolute_path(input_path), "rt", encoding="utf-8")
|
||||
elif input_path.suffix == ".jsonl":
|
||||
else:
|
||||
jsonl_file = open(get_absolute_path(input_path), "r", encoding="utf-8")
|
||||
|
||||
# Read JSONL file
|
||||
@@ -36,17 +36,6 @@ def load_jsonl(input_path):
|
||||
return data
|
||||
|
||||
|
||||
def dump_jsonl(jsonl_data, output_path):
|
||||
"Write List of JSON objects to JSON line file"
|
||||
# Create output directory, if it doesn't exist
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
f.write(jsonl_data)
|
||||
|
||||
logger.debug(f"Wrote jsonl data to {output_path}")
|
||||
|
||||
|
||||
def compress_jsonl_data(jsonl_data, output_path):
|
||||
# Create output directory, if it doesn't exist
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@@ -119,10 +119,11 @@ class AppConfig(ConfigBase):
|
||||
|
||||
|
||||
class FullConfig(ConfigBase):
|
||||
content_type: Optional[ContentConfig]
|
||||
search_type: Optional[SearchConfig]
|
||||
processor: Optional[ProcessorConfig]
|
||||
content_type: Optional[ContentConfig] = None
|
||||
search_type: Optional[SearchConfig] = None
|
||||
processor: Optional[ProcessorConfig] = None
|
||||
app: Optional[AppConfig] = AppConfig(should_log_telemetry=True)
|
||||
version: Optional[str] = None
|
||||
|
||||
|
||||
class SearchResponse(ConfigBase):
|
||||
|
||||
@@ -9,13 +9,14 @@ from pathlib import Path

# Internal Packages
from khoj.utils import config as utils_config
from khoj.utils.config import SearchModels, ProcessorConfigModel
from khoj.utils.config import ContentIndex, SearchModels, ProcessorConfigModel
from khoj.utils.helpers import LRU
from khoj.utils.rawconfig import FullConfig

# Application Global State
config = FullConfig()
model = SearchModels()
search_models = SearchModels()
content_index = ContentIndex()
processor_config = ProcessorConfigModel()
config_file: Path = None
verbose: int = 0
@@ -23,7 +24,7 @@ host: str = None
port: int = None
cli_args: List[str] = None
query_cache = LRU()
search_index_lock = threading.Lock()
config_lock = threading.Lock()
SearchType = utils_config.SearchType
telemetry: List[Dict[str, str]] = []
previous_query: str = None

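Because indexing now mutates the shared search_models and content_index globals declared above, update paths can serialize rebuilds with search_index_lock; the wrapper function below is illustrative.

# --- Illustrative usage sketch, not part of the diff ---
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.search_type import text_search
from khoj.utils import state

def reindex_org_notes(content_config, regenerate: bool = False):
    # Hold the lock so concurrent API requests never read a half-updated index
    with state.search_index_lock:
        state.content_index.org = text_search.setup(
            OrgToJsonl,
            content_config.org,
            state.search_models.text_search.bi_encoder,
            regenerate=regenerate,
        )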
@@ -10,6 +10,7 @@ from khoj.main import app
|
||||
from khoj.configure import configure_processor, configure_routes, configure_search_types
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.utils.config import ImageContent, SearchModels, TextContent
|
||||
from khoj.utils.helpers import resolve_absolute_path
|
||||
from khoj.utils.rawconfig import (
|
||||
ContentConfig,
|
||||
@@ -41,50 +42,66 @@ def search_config() -> SearchConfig:
|
||||
encoder="sentence-transformers/all-MiniLM-L6-v2",
|
||||
cross_encoder="cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||
model_directory=model_dir / "symmetric/",
|
||||
encoder_type=None,
|
||||
)
|
||||
|
||||
search_config.asymmetric = TextSearchConfig(
|
||||
encoder="sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
|
||||
cross_encoder="cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||
model_directory=model_dir / "asymmetric/",
|
||||
encoder_type=None,
|
||||
)
|
||||
|
||||
search_config.image = ImageSearchConfig(
|
||||
encoder="sentence-transformers/clip-ViT-B-32", model_directory=model_dir / "image/"
|
||||
encoder="sentence-transformers/clip-ViT-B-32",
|
||||
model_directory=model_dir / "image/",
|
||||
encoder_type=None,
|
||||
)
|
||||
|
||||
return search_config
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def content_config(tmp_path_factory, search_config: SearchConfig):
|
||||
def search_models(search_config: SearchConfig):
|
||||
search_models = SearchModels()
|
||||
search_models.text_search = text_search.initialize_model(search_config.asymmetric)
|
||||
search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
|
||||
return search_models
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def content_config(tmp_path_factory, search_models: SearchModels, search_config: SearchConfig):
|
||||
content_dir = tmp_path_factory.mktemp("content")
|
||||
|
||||
# Generate Image Embeddings from Test Images
|
||||
content_config = ContentConfig()
|
||||
content_config.image = ImageContentConfig(
|
||||
input_filter=None,
|
||||
input_directories=["tests/data/images"],
|
||||
embeddings_file=content_dir.joinpath("image_embeddings.pt"),
|
||||
batch_size=1,
|
||||
use_xmp_metadata=False,
|
||||
)
|
||||
|
||||
image_search.setup(content_config.image, search_config.image, regenerate=False)
|
||||
image_search.setup(content_config.image, search_models.image_search.image_encoder, regenerate=False)
|
||||
|
||||
# Generate Notes Embeddings from Test Notes
|
||||
content_config.org = TextContentConfig(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/org/*.org"],
|
||||
compressed_jsonl=content_dir.joinpath("notes.jsonl"),
|
||||
compressed_jsonl=content_dir.joinpath("notes.jsonl.gz"),
|
||||
embeddings_file=content_dir.joinpath("note_embeddings.pt"),
|
||||
)
|
||||
|
||||
filters = [DateFilter(), WordFilter(), FileFilter()]
|
||||
text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||
text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, filters=filters
|
||||
)
|
||||
|
||||
content_config.plugins = {
|
||||
"plugin1": TextContentConfig(
|
||||
input_files=[content_dir.joinpath("notes.jsonl")],
|
||||
input_files=[content_dir.joinpath("notes.jsonl.gz")],
|
||||
input_filter=None,
|
||||
compressed_jsonl=content_dir.joinpath("plugin.jsonl.gz"),
|
||||
embeddings_file=content_dir.joinpath("plugin_embeddings.pt"),
|
||||
@@ -106,7 +123,11 @@ def content_config(tmp_path_factory, search_config: SearchConfig):
|
||||
|
||||
filters = [DateFilter(), WordFilter(), FileFilter()]
|
||||
text_search.setup(
|
||||
JsonlToJsonl, content_config.plugins["plugin1"], search_config.asymmetric, regenerate=False, filters=filters
|
||||
JsonlToJsonl,
|
||||
content_config.plugins["plugin1"],
|
||||
search_models.text_search.bi_encoder,
|
||||
regenerate=False,
|
||||
filters=filters,
|
||||
)
|
||||
|
||||
return content_config
|
||||
@@ -121,7 +142,7 @@ def md_content_config(tmp_path_factory):
|
||||
content_config.markdown = TextContentConfig(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/markdown/*.markdown"],
|
||||
compressed_jsonl=content_dir.joinpath("markdown.jsonl"),
|
||||
compressed_jsonl=content_dir.joinpath("markdown.jsonl.gz"),
|
||||
embeddings_file=content_dir.joinpath("markdown_embeddings.pt"),
|
||||
)
|
||||
|
||||
@@ -157,8 +178,13 @@ def chat_client(md_content_config: ContentConfig, search_config: SearchConfig, p
|
||||
|
||||
# Index Markdown Content for Search
|
||||
filters = [DateFilter(), WordFilter(), FileFilter()]
|
||||
state.model.markdown_search = text_search.setup(
|
||||
MarkdownToJsonl, md_content_config.markdown, search_config.asymmetric, regenerate=False, filters=filters
|
||||
state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
|
||||
state.content_index.markdown = text_search.setup(
|
||||
MarkdownToJsonl,
|
||||
md_content_config.markdown,
|
||||
state.search_models.text_search.bi_encoder,
|
||||
regenerate=False,
|
||||
filters=filters,
|
||||
)
|
||||
|
||||
# Initialize Processor from Config
|
||||
@@ -175,8 +201,14 @@ def client(content_config: ContentConfig, search_config: SearchConfig, processor
|
||||
state.SearchType = configure_search_types(state.config)
|
||||
|
||||
# These lines help us Mock the Search models for these search types
|
||||
state.model.org_search = {}
|
||||
state.model.image_search = {}
|
||||
state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
|
||||
state.search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
state.content_index.org = text_search.setup(
|
||||
OrgToJsonl, content_config.org, state.search_models.text_search.bi_encoder, regenerate=False
|
||||
)
|
||||
state.content_index.image = image_search.setup(
|
||||
content_config.image, state.search_models.image_search, regenerate=False
|
||||
)
|
||||
|
||||
configure_routes(app)
|
||||
return TestClient(app)
|
||||
|
||||
@@ -27,9 +27,9 @@
- Run ~M-x khoj <user-query>~ or Call ~C-c C-s~

- *Khoj via API*
- Query: ~GET~ [[http://localhost:8000/api/search?q=%22what%20is%20the%20meaning%20of%20life%22][http://localhost:8000/api/search?q="What is the meaning of life"]]
- Update Index: ~GET~ [[http://localhost:8000/api/update][http://localhost:8000/api/update]]
- [[http://localhost:8000/docs][Khoj API Docs]]
- Query: ~GET~ [[http://localhost:42110/api/search?q=%22what%20is%20the%20meaning%20of%20life%22][http://localhost:42110/api/search?q="What is the meaning of life"]]
- Update Index: ~GET~ [[http://localhost:42110/api/update][http://localhost:42110/api/update]]
- [[http://localhost:42110/docs][Khoj API Docs]]

- *Call Khoj via Python Script Directly*
#+begin_src shell

@@ -11,7 +11,8 @@ from fastapi.testclient import TestClient
|
||||
from khoj.main import app
|
||||
from khoj.configure import configure_routes, configure_search_types
|
||||
from khoj.utils import state
|
||||
from khoj.utils.state import model, config
|
||||
from khoj.utils.config import SearchModels
|
||||
from khoj.utils.state import search_models, content_index, config
|
||||
from khoj.search_type import text_search, image_search
|
||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
@@ -34,11 +35,11 @@ def test_search_with_invalid_content_type(client):
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_search_with_valid_content_type(client):
|
||||
for content_type in ["all", "org", "markdown", "image", "pdf", "plugin1"]:
|
||||
for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plugin1"]:
|
||||
# Act
|
||||
response = client.get(f"/api/search?q=random&t={content_type}")
|
||||
# Assert
|
||||
assert response.status_code == 200
|
||||
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@@ -52,11 +53,11 @@ def test_update_with_invalid_content_type(client):
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_update_with_valid_content_type(client):
|
||||
for content_type in ["org", "markdown", "image", "pdf", "plugin1"]:
|
||||
for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plugin1"]:
|
||||
# Act
|
||||
response = client.get(f"/api/update?t={content_type}")
|
||||
# Assert
|
||||
assert response.status_code == 200
|
||||
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@@ -70,11 +71,11 @@ def test_regenerate_with_invalid_content_type(client):
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_regenerate_with_valid_content_type(client):
|
||||
for content_type in ["org", "markdown", "image", "pdf", "plugin1"]:
|
||||
for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plugin1"]:
|
||||
# Act
|
||||
response = client.get(f"/api/update?force=true&t={content_type}")
|
||||
# Assert
|
||||
assert response.status_code == 200
|
||||
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@@ -143,7 +144,10 @@ def test_get_configured_types_with_no_content_config():
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_image_search(client, content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
model.image_search = image_search.setup(content_config.image, search_config.image, regenerate=False)
|
||||
search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
content_index.image = image_search.setup(
|
||||
content_config.image, search_models.image_search.image_encoder, regenerate=False
|
||||
)
|
||||
query_expected_image_pairs = [
|
||||
("kitten", "kitten_park.jpg"),
|
||||
("a horse and dog on a leash", "horse_dog.jpg"),
|
||||
@@ -166,7 +170,10 @@ def test_image_search(client, content_config: ContentConfig, search_config: Sear
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_notes_search(client, content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
model.org_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
search_models.text_search = text_search.initialize_model(search_config.asymmetric)
|
||||
content_index.org = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False
|
||||
)
|
||||
user_query = quote("How to git install application?")
|
||||
|
||||
# Act
|
||||
@@ -183,8 +190,9 @@ def test_notes_search(client, content_config: ContentConfig, search_config: Sear
|
||||
def test_notes_search_with_only_filters(client, content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
filters = [WordFilter(), FileFilter()]
|
||||
model.org_search = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters
|
||||
search_models.text_search = text_search.initialize_model(search_config.asymmetric)
|
||||
content_index.org = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, filters=filters
|
||||
)
|
||||
user_query = quote('+"Emacs" file:"*.org"')
|
||||
|
||||
@@ -202,8 +210,9 @@ def test_notes_search_with_only_filters(client, content_config: ContentConfig, s
|
||||
def test_notes_search_with_include_filter(client, content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
filters = [WordFilter()]
|
||||
model.org_search = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters
|
||||
search_models.text_search = text_search.initialize_model(search_config.asymmetric)
|
||||
content_index.org = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_models.text_search, regenerate=False, filters=filters
|
||||
)
|
||||
user_query = quote('How to git install application? +"Emacs"')
|
||||
|
||||
@@ -221,8 +230,9 @@ def test_notes_search_with_include_filter(client, content_config: ContentConfig,
|
||||
def test_notes_search_with_exclude_filter(client, content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
filters = [WordFilter()]
|
||||
model.org_search = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters
|
||||
search_models.text_search = text_search.initialize_model(search_config.asymmetric)
|
||||
content_index.org = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, filters=filters
|
||||
)
|
||||
user_query = quote('How to git install application? -"clone"')
|
||||
|
||||
|
||||
@@ -5,9 +5,10 @@ from PIL import Image
|
||||
|
||||
# External Packages
|
||||
import pytest
|
||||
from khoj.utils.config import SearchModels
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils.state import model
|
||||
from khoj.utils.state import content_index, search_models
|
||||
from khoj.utils.constants import web_directory
|
||||
from khoj.search_type import image_search
|
||||
from khoj.utils.helpers import resolve_absolute_path
|
||||
@@ -16,10 +17,12 @@ from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
||||
|
||||
# Test
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_image_search_setup(content_config: ContentConfig, search_config: SearchConfig):
|
||||
def test_image_search_setup(content_config: ContentConfig, search_models: SearchModels):
|
||||
# Act
|
||||
# Regenerate image search embeddings during image setup
|
||||
image_search_model = image_search.setup(content_config.image, search_config.image, regenerate=True)
|
||||
image_search_model = image_search.setup(
|
||||
content_config.image, search_models.image_search.image_encoder, regenerate=True
|
||||
)
|
||||
|
||||
# Assert
|
||||
assert len(image_search_model.image_names) == 3
|
||||
@@ -54,8 +57,11 @@ def test_image_metadata(content_config: ContentConfig):
|
||||
@pytest.mark.anyio
|
||||
async def test_image_search(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
content_index.image = image_search.setup(
|
||||
content_config.image, search_models.image_search.image_encoder, regenerate=False
|
||||
)
|
||||
output_directory = resolve_absolute_path(web_directory)
|
||||
model.image_search = image_search.setup(content_config.image, search_config.image, regenerate=False)
|
||||
query_expected_image_pairs = [
|
||||
("kitten", "kitten_park.jpg"),
|
||||
("horse and dog in a farm", "horse_dog.jpg"),
|
||||
@@ -64,11 +70,13 @@ async def test_image_search(content_config: ContentConfig, search_config: Search
|
||||
|
||||
# Act
|
||||
for query, expected_image_name in query_expected_image_pairs:
|
||||
hits = await image_search.query(query, count=1, model=model.image_search)
|
||||
hits = await image_search.query(
|
||||
query, count=1, search_model=search_models.image_search, content=content_index.image
|
||||
)
|
||||
|
||||
results = image_search.collate_results(
|
||||
hits,
|
||||
model.image_search.image_names,
|
||||
content_index.image.image_names,
|
||||
output_directory=output_directory,
|
||||
image_files_url="/static/images",
|
||||
count=1,
|
||||
@@ -90,7 +98,10 @@ async def test_image_search(content_config: ContentConfig, search_config: Search
|
||||
@pytest.mark.anyio
|
||||
async def test_image_search_query_truncated(content_config: ContentConfig, search_config: SearchConfig, caplog):
|
||||
# Arrange
|
||||
model.image_search = image_search.setup(content_config.image, search_config.image, regenerate=False)
|
||||
search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
content_index.image = image_search.setup(
|
||||
content_config.image, search_models.image_search.image_encoder, regenerate=False
|
||||
)
|
||||
max_words_supported = 10
|
||||
query = " ".join(["hello"] * 100)
|
||||
truncated_query = " ".join(["hello"] * max_words_supported)
|
||||
@@ -98,7 +109,9 @@ async def test_image_search_query_truncated(content_config: ContentConfig, searc
|
||||
# Act
|
||||
try:
|
||||
with caplog.at_level(logging.INFO, logger="khoj.search_type.image_search"):
|
||||
await image_search.query(query, count=1, model=model.image_search)
|
||||
await image_search.query(
|
||||
query, count=1, search_model=search_models.image_search, content=content_index.image
|
||||
)
|
||||
# Assert
|
||||
except RuntimeError as e:
|
||||
if "The size of tensor a (102) must match the size of tensor b (77)" in str(e):
|
||||
@@ -110,8 +123,11 @@ async def test_image_search_query_truncated(content_config: ContentConfig, searc
|
||||
@pytest.mark.anyio
|
||||
async def test_image_search_by_filepath(content_config: ContentConfig, search_config: SearchConfig, caplog):
|
||||
# Arrange
|
||||
search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
content_index.image = image_search.setup(
|
||||
content_config.image, search_models.image_search.image_encoder, regenerate=False
|
||||
)
|
||||
output_directory = resolve_absolute_path(web_directory)
|
||||
model.image_search = image_search.setup(content_config.image, search_config.image, regenerate=False)
|
||||
image_directory = content_config.image.input_directories[0]
|
||||
|
||||
query = f"file:{image_directory.joinpath('kitten_park.jpg')}"
|
||||
@@ -119,11 +135,13 @@ async def test_image_search_by_filepath(content_config: ContentConfig, search_co
|
||||
|
||||
# Act
|
||||
with caplog.at_level(logging.INFO, logger="khoj.search_type.image_search"):
|
||||
hits = await image_search.query(query, count=1, model=model.image_search)
|
||||
hits = await image_search.query(
|
||||
query, count=1, search_model=search_models.image_search, content=content_index.image
|
||||
)
|
||||
|
||||
results = image_search.collate_results(
|
||||
hits,
|
||||
model.image_search.image_names,
|
||||
content_index.image.image_names,
|
||||
output_directory=output_directory,
|
||||
image_files_url="/static/images",
|
||||
count=1,
|
||||
|
||||
@@ -5,9 +5,11 @@ import os
|
||||
|
||||
# External Packages
|
||||
import pytest
|
||||
import torch
|
||||
from khoj.utils.config import SearchModels
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils.state import model
|
||||
from khoj.utils.state import content_index, search_models
|
||||
from khoj.search_type import text_search
|
||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
@@ -16,7 +18,7 @@ from khoj.processor.github.github_to_jsonl import GithubToJsonl
|
||||
|
||||
# Test
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_asymmetric_setup_with_missing_file_raises_error(
|
||||
def test_text_search_setup_with_missing_file_raises_error(
|
||||
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
|
||||
):
|
||||
# Arrange
|
||||
@@ -31,7 +33,7 @@ def test_asymmetric_setup_with_missing_file_raises_error(
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_asymmetric_setup_with_empty_file_raises_error(
|
||||
def test_text_search_setup_with_empty_file_raises_error(
|
||||
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
|
||||
):
|
||||
# Act
|
||||
@@ -41,10 +43,12 @@ def test_asymmetric_setup_with_empty_file_raises_error(
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchConfig):
|
||||
def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
|
||||
# Act
|
||||
# Regenerate notes embeddings during asymmetric setup
|
||||
notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
notes_model = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
|
||||
)
|
||||
|
||||
# Assert
|
||||
assert len(notes_model.entries) == 10
|
||||
@@ -52,34 +56,39 @@ def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchCo
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_text_content_index_only_updates_on_changes(content_config: ContentConfig, search_config: SearchConfig, caplog):
|
||||
def test_text_index_same_if_content_unchanged(content_config: ContentConfig, search_models: SearchModels, caplog):
|
||||
# Arrange
|
||||
caplog.set_level(logging.INFO, logger="khoj")
|
||||
|
||||
# Act
|
||||
# Generate initial notes embeddings during asymmetric setup
|
||||
text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
text_search.setup(OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True)
|
||||
initial_logs = caplog.text
|
||||
caplog.clear() # Clear logs
|
||||
|
||||
# Run asymmetric setup again with no changes to data source. Ensure index is not updated
|
||||
text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
text_search.setup(OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False)
|
||||
final_logs = caplog.text
|
||||
|
||||
# Assert
|
||||
assert "📩 Saved computed text embeddings to" in initial_logs
|
||||
assert "📩 Saved computed text embeddings to" not in final_logs
|
||||
assert "Creating index from scratch." in initial_logs
|
||||
assert "Creating index from scratch." not in final_logs
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.anyio
|
||||
async def test_asymmetric_search(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
model.notes_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
search_models.text_search = text_search.initialize_model(search_config.asymmetric)
|
||||
content_index.org = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
|
||||
)
|
||||
query = "How to git install application?"
|
||||
|
||||
# Act
|
||||
hits, entries = await text_search.query(query, model=model.notes_search, rank_results=True)
|
||||
hits, entries = await text_search.query(
|
||||
query, search_model=search_models.text_search, content=content_index.org, rank_results=True
|
||||
)
|
||||
|
||||
results = text_search.collate_results(hits, entries, count=1)
|
||||
|
||||
@@ -90,7 +99,7 @@ async def test_asymmetric_search(content_config: ContentConfig, search_config: S
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig):
|
||||
def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: TextContentConfig, search_models: SearchModels):
|
||||
# Arrange
|
||||
# Insert org-mode entry with size exceeding max token limit to new org file
|
||||
max_tokens = 256
|
||||
@@ -103,7 +112,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: TextContent
|
||||
# Act
|
||||
# reload embeddings, entries, notes model after adding new org-mode file
|
||||
initial_notes_model = text_search.setup(
|
||||
OrgToJsonl, org_config_with_only_new_file, search_config.asymmetric, regenerate=False
|
||||
OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
|
||||
)
|
||||
|
||||
# Assert
|
||||
@@ -113,9 +122,13 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: TextContent
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig, new_org_file: Path):
|
||||
def test_regenerate_index_with_new_entry(
|
||||
content_config: ContentConfig, search_models: SearchModels, new_org_file: Path
|
||||
):
|
||||
# Arrange
|
||||
initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
initial_notes_model = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
|
||||
)
|
||||
|
||||
assert len(initial_notes_model.entries) == 10
|
||||
assert len(initial_notes_model.corpus_embeddings) == 10
|
||||
@@ -125,23 +138,20 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC
|
||||
with open(new_org_file, "w") as f:
|
||||
f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
|
||||
|
||||
# Act
|
||||
# regenerate notes jsonl, model embeddings and model to include entry from new file
|
||||
regenerated_notes_model = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True
|
||||
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
|
||||
)
|
||||
|
||||
# Act
|
||||
# reload embeddings, entries, notes model from previously generated notes jsonl and model embeddings files
|
||||
initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
|
||||
# Assert
|
||||
assert len(regenerated_notes_model.entries) == 11
|
||||
assert len(regenerated_notes_model.corpus_embeddings) == 11
|
||||
|
||||
# Assert
|
||||
# verify new entry loaded from updated embeddings, entries
|
||||
assert len(initial_notes_model.entries) == 11
|
||||
assert len(initial_notes_model.corpus_embeddings) == 11
|
||||
# verify new entry appended to index, without disrupting order or content of existing entries
|
||||
error_details = compare_index(initial_notes_model, regenerated_notes_model)
|
||||
if error_details:
|
||||
pytest.fail(error_details, False)
|
||||
|
||||
# Cleanup
|
||||
# reset input_files in config to empty list
|
||||
@@ -149,26 +159,101 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_incremental_update(content_config: ContentConfig, search_config: SearchConfig, new_org_file: Path):
|
||||
def test_update_index_with_duplicate_entries_in_stable_order(
|
||||
org_config_with_only_new_file: TextContentConfig, search_models: SearchModels
|
||||
):
|
||||
# Arrange
|
||||
initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
|
||||
|
||||
assert len(initial_notes_model.entries) == 10
|
||||
assert len(initial_notes_model.corpus_embeddings) == 10
|
||||
# Insert org-mode entries with same compiled form into new org file
|
||||
new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
|
||||
with open(new_file_to_index, "w") as f:
|
||||
f.write(f"{new_entry}{new_entry}")
|
||||
|
||||
# Act
|
||||
# load embeddings, entries, notes model after adding new org-mode file
|
||||
initial_index = text_search.setup(
|
||||
OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
|
||||
)
|
||||
|
||||
# update embeddings, entries, notes model after adding new org-mode file
|
||||
updated_index = text_search.setup(
|
||||
OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
|
||||
)
|
||||
|
||||
# Assert
|
||||
# verify only 1 entry added even if there are multiple duplicate entries
|
||||
assert len(initial_index.entries) == len(updated_index.entries) == 1
|
||||
assert len(initial_index.corpus_embeddings) == len(updated_index.corpus_embeddings) == 1
|
||||
|
||||
# verify the same entry is added even when there are multiple duplicate entries
|
||||
error_details = compare_index(initial_index, updated_index)
|
||||
if error_details:
|
||||
pytest.fail(error_details)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_update_index_with_deleted_entry(org_config_with_only_new_file: TextContentConfig, search_models: SearchModels):
|
||||
# Arrange
|
||||
new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
|
||||
|
||||
# Insert org-mode entries with same compiled form into new org file
|
||||
new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
|
||||
with open(new_file_to_index, "w") as f:
|
||||
f.write(f"{new_entry}{new_entry} -- Tatooine")
|
||||
|
||||
# load embeddings, entries, notes model after adding new org file with 2 entries
|
||||
initial_index = text_search.setup(
|
||||
OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
|
||||
)
|
||||
|
||||
# update embeddings, entries, notes model after removing an entry from the org file
|
||||
with open(new_file_to_index, "w") as f:
|
||||
f.write(f"{new_entry}")
|
||||
|
||||
# Act
|
||||
updated_index = text_search.setup(
|
||||
OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
|
||||
)
|
||||
|
||||
# Assert
|
||||
# verify only 1 entry added even if there are multiple duplicate entries
|
||||
assert len(initial_index.entries) == len(updated_index.entries) + 1
|
||||
assert len(initial_index.corpus_embeddings) == len(updated_index.corpus_embeddings) + 1
|
||||
|
||||
# verify the same entry is added even when there are multiple duplicate entries
|
||||
error_details = compare_index(updated_index, initial_index)
|
||||
if error_details:
|
||||
pytest.fail(error_details)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_update_index_with_new_entry(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path):
|
||||
# Arrange
|
||||
initial_notes_model = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True, normalize=False
|
||||
)
|
||||
|
||||
# append org-mode entry to first org input file in config
|
||||
with open(new_org_file, "w") as f:
|
||||
f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
|
||||
new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
|
||||
f.write(new_entry)
|
||||
|
||||
# Act
|
||||
# update embeddings, entries with the newly added note
|
||||
content_config.org.input_files = [f"{new_org_file}"]
|
||||
initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
final_notes_model = text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, normalize=False
|
||||
)
|
||||
|
||||
# Assert
|
||||
# verify new entry added in updated embeddings, entries
|
||||
assert len(initial_notes_model.entries) == 11
|
||||
assert len(initial_notes_model.corpus_embeddings) == 11
|
||||
assert len(final_notes_model.entries) == len(initial_notes_model.entries) + 1
|
||||
assert len(final_notes_model.corpus_embeddings) == len(initial_notes_model.corpus_embeddings) + 1
|
||||
|
||||
# verify new entry appended to index, without disrupting order or content of existing entries
|
||||
error_details = compare_index(initial_notes_model, final_notes_model)
|
||||
if error_details:
|
||||
pytest.fail(error_details, False)
|
||||
|
||||
# Cleanup
|
||||
# reset input_files in config to empty list
|
||||
@@ -177,10 +262,34 @@ def test_incremental_update(content_config: ContentConfig, search_config: Search
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.skipif(os.getenv("GITHUB_PAT_TOKEN") is None, reason="GITHUB_PAT_TOKEN not set")
|
||||
def test_asymmetric_setup_github(content_config: ContentConfig, search_config: SearchConfig):
|
||||
def test_asymmetric_setup_github(content_config: ContentConfig, search_models: SearchModels):
|
||||
# Act
|
||||
# Regenerate github embeddings to test asymmetric setup without caching
|
||||
github_model = text_search.setup(GithubToJsonl, content_config.github, search_config.asymmetric, regenerate=True)
|
||||
github_model = text_search.setup(
|
||||
GithubToJsonl, content_config.github, search_models.text_search.bi_encoder, regenerate=True
|
||||
)
|
||||
|
||||
# Assert
|
||||
assert len(github_model.entries) > 1
|
||||
|
||||
|
||||
def compare_index(initial_notes_model, final_notes_model):
|
||||
mismatched_entries, mismatched_embeddings = [], []
|
||||
for index in range(len(initial_notes_model.entries)):
|
||||
if initial_notes_model.entries[index].to_json() != final_notes_model.entries[index].to_json():
|
||||
mismatched_entries.append(index)
|
||||
|
||||
# verify new entry embedding appended to embeddings tensor, without disrupting order or content of existing embeddings
|
||||
for index in range(len(initial_notes_model.corpus_embeddings)):
|
||||
if not torch.equal(final_notes_model.corpus_embeddings[index], initial_notes_model.corpus_embeddings[index]):
|
||||
mismatched_embeddings.append(index)
|
||||
|
||||
error_details = ""
|
||||
if mismatched_entries:
|
||||
mismatched_entries_str = ",".join(map(str, mismatched_entries))
|
||||
error_details += f"Entries at {mismatched_entries_str} not equal\n"
|
||||
if mismatched_embeddings:
|
||||
mismatched_embeddings_str = ", ".join(map(str, mismatched_embeddings))
|
||||
error_details += f"Embeddings at {mismatched_embeddings_str} not equal\n"
|
||||
|
||||
return error_details
|
||||
|
||||
@@ -12,5 +12,6 @@
"0.7.1": "0.15.0",
"0.8.0": "0.15.0",
"0.8.1": "0.15.0",
"0.8.2": "0.15.0"
"0.8.2": "0.15.0",
"0.9.0": "0.15.0"
}
