mirror of
https://github.com/khoj-ai/khoj.git
synced 2026-05-13 21:41:41 +00:00
Compare commits
150 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
35aa06067f | ||
|
|
8f4e5d3d83 | ||
|
|
5673bd5b96 | ||
|
|
a2ab68a7a2 | ||
|
|
67129964a7 | ||
|
|
d3257cb24e | ||
|
|
40091489c0 | ||
|
|
240db7b4f0 | ||
|
|
234be96e53 | ||
|
|
53d421f9c6 | ||
|
|
c8c0cfd10e | ||
|
|
7ecae224e7 | ||
|
|
3d616c8d65 | ||
|
|
23bd737f6b | ||
|
|
81e98c3079 | ||
|
|
59ff1ae27f | ||
|
|
001ac7b5eb | ||
|
|
112f388ada | ||
|
|
1d3d949962 | ||
|
|
cd46a17e5f | ||
|
|
c0972e09e6 | ||
|
|
64fff1d372 | ||
|
|
7478d08803 | ||
|
|
fc218508f9 | ||
|
|
87090531da | ||
|
|
83a7ccd729 | ||
|
|
5c2327ee4f | ||
|
|
6e8a40906d | ||
|
|
526a927bce | ||
|
|
7243059507 | ||
|
|
8a9055f918 | ||
|
|
ae535a06eb | ||
|
|
36b17d4ae0 | ||
|
|
924424c754 | ||
|
|
359a2cacef | ||
|
|
d7fb9a596e | ||
|
|
8a21aff438 | ||
|
|
cb40a96c85 | ||
|
|
50760acc37 | ||
|
|
82eb4bfd0d | ||
|
|
99d19dcf43 | ||
|
|
c92d79118a | ||
|
|
e281a498b4 | ||
|
|
4f655d20ae | ||
|
|
f6ff7b1beb | ||
|
|
285a2b86d2 | ||
|
|
67c850a4ac | ||
|
|
0aebf624fc | ||
|
|
ff846f05c5 | ||
|
|
7e36f421f9 | ||
|
|
4725416fbd | ||
|
|
508b2176b7 | ||
|
|
b08745b541 | ||
|
|
27217a330d | ||
|
|
5e9558d39d | ||
|
|
cf28f104c7 | ||
|
|
93e2aff786 | ||
|
|
d78454d4ad | ||
|
|
4070d13a96 | ||
|
|
863933daaa | ||
|
|
e9ca04af0d | ||
|
|
06df394d6c | ||
|
|
364e6c11af | ||
|
|
36b52fdd0a | ||
|
|
72f63a6ef7 | ||
|
|
e4d67694e1 | ||
|
|
98e5ea4940 | ||
|
|
2f6284872d | ||
|
|
a9b81975f2 | ||
|
|
b351cfb8a0 | ||
|
|
601ff2541b | ||
|
|
e28526bbc9 | ||
|
|
939d7731da | ||
|
|
f63fd0995e | ||
|
|
10836dedee | ||
|
|
08f5fb315f | ||
|
|
f09bdd515b | ||
|
|
36c7389b46 | ||
|
|
2600cc9d4d | ||
|
|
45cb510421 | ||
|
|
d871e04a81 | ||
|
|
1a5d1130f4 | ||
|
|
d0f14d3f85 | ||
|
|
dfb277ee37 | ||
|
|
e75e13d788 | ||
|
|
4e15b4e411 | ||
|
|
1b4d562700 | ||
|
|
b6d63137f1 | ||
|
|
3f719c9e17 | ||
|
|
7526a50dd4 | ||
|
|
7c4d546039 | ||
|
|
c1128a1ad8 | ||
|
|
9306cd901a | ||
|
|
24ddebf3ce | ||
|
|
8609e3129e | ||
|
|
6c0e82b2d6 | ||
|
|
cccd225247 | ||
|
|
b9caad458e | ||
|
|
198d9af8cf | ||
|
|
a71f168273 | ||
|
|
bcc0bed9db | ||
|
|
8bb8824d0c | ||
|
|
e16d0b6d7e | ||
|
|
c3c7b8a951 | ||
|
|
3838f9d8e3 | ||
|
|
f7b8cdd02e | ||
|
|
2739a492b4 | ||
|
|
87d1e1341d | ||
|
|
280061e1fa | ||
|
|
672f61529e | ||
|
|
4fb628975c | ||
|
|
b6cdc5c7cb | ||
|
|
7f994274bb | ||
|
|
d73042426d | ||
|
|
45f461d175 | ||
|
|
7cad1c9428 | ||
|
|
ad1f1cf620 | ||
|
|
9d42b5d60d | ||
|
|
c3b624e351 | ||
|
|
7184508784 | ||
|
|
211e460398 | ||
|
|
c823f46d89 | ||
|
|
b6dbe4dd1d | ||
|
|
1ae40163a9 | ||
|
|
fe03ba3dce | ||
|
|
ed177db2be | ||
|
|
7ad251b8ef | ||
|
|
2bed4c3b50 | ||
|
|
8914dbd073 | ||
|
|
e77a5ffc83 | ||
|
|
b09350c052 | ||
|
|
b177adf3a7 | ||
|
|
ede6eb6879 | ||
|
|
88a9eadfba | ||
|
|
ab501a56c9 | ||
|
|
f944408e69 | ||
|
|
88344f9ed2 | ||
|
|
c2814fce58 | ||
|
|
f3f24387ec | ||
|
|
1e43f1a12e | ||
|
|
9d38eadd42 | ||
|
|
68bd5d9ebc | ||
|
|
d91c7e2761 | ||
|
|
47b58a2a4d | ||
|
|
ab0d3a08e2 | ||
|
|
55a032e8c4 | ||
|
|
fcbbe8c759 | ||
|
|
f57d7bf5ad | ||
|
|
fada617faa | ||
|
|
61b6ee2857 |
@@ -6,4 +6,5 @@ docs/
|
||||
tests/
|
||||
build/
|
||||
dist/
|
||||
scripts/
|
||||
*.egg-info/
|
||||
|
||||
4
.github/workflows/build_khoj_el.yml
vendored
4
.github/workflows/build_khoj_el.yml
vendored
@@ -24,13 +24,13 @@ jobs:
|
||||
- name: Set up Python 3.9
|
||||
uses: actions/setup-python@v1
|
||||
with: { python-version: 3.9 }
|
||||
- name: Install
|
||||
- name: ⏬️ Install Dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
sudo apt-get install emacs && emacs --version
|
||||
git clone https://github.com/riscy/melpazoid.git ~/melpazoid
|
||||
pip install ~/melpazoid
|
||||
- name: Run
|
||||
- name: 🌡️ Validate Khoj.el
|
||||
env:
|
||||
# Khoj recipe from https://github.com/melpa/melpa/pull/8321/files
|
||||
RECIPE: (khoj :fetcher github :repo "debanjum/khoj" :files ("src/interface/emacs/*.el"))
|
||||
|
||||
2
.github/workflows/dockerize.yml
vendored
2
.github/workflows/dockerize.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.PAT }}
|
||||
|
||||
- name: Build and Push Docker Image
|
||||
- name: 📦 Build and Push Docker Image
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
context: .
|
||||
|
||||
11
.github/workflows/pypi.yml
vendored
11
.github/workflows/pypi.yml
vendored
@@ -32,7 +32,7 @@ jobs:
|
||||
with:
|
||||
python-version: '3.10'
|
||||
|
||||
- name: Install Application
|
||||
- name: ⬇️ Install Application
|
||||
run: python -m pip install --upgrade pip && pip install --upgrade .
|
||||
|
||||
- name: ⚙️ Build Python Package
|
||||
@@ -45,7 +45,7 @@ jobs:
|
||||
# Build PyPi Package
|
||||
pipx run build
|
||||
|
||||
- name: 👀 Validate Python Package
|
||||
- name: 🌡️ Validate Python Package
|
||||
run: |
|
||||
# Validate PyPi Package
|
||||
pipx run check-wheel-contents dist/*.whl
|
||||
@@ -62,10 +62,3 @@ jobs:
|
||||
uses: pypa/gh-action-pypi-publish@v1.6.4
|
||||
with:
|
||||
password: ${{ secrets.PYPI_API_KEY }}
|
||||
|
||||
- name: 📦 Publish Python Package to Test PyPI
|
||||
if: ${{ github.event.pull_request.head.repo.full_name == github.repository }}
|
||||
uses: pypa/gh-action-pypi-publish@v1.6.4
|
||||
with:
|
||||
password: ${{ secrets.PYPI_API_KEY }}
|
||||
repository_url: https://test.pypi.org/legacy/
|
||||
|
||||
32
.github/workflows/release.yml
vendored
32
.github/workflows/release.yml
vendored
@@ -13,7 +13,7 @@ on:
|
||||
|
||||
jobs:
|
||||
publish_obsidian_plugin:
|
||||
name: Publish Obsidian Plugin
|
||||
name: 💎 Publish Obsidian Plugin
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
@@ -27,26 +27,33 @@ jobs:
|
||||
with:
|
||||
node-version: "lts/*"
|
||||
|
||||
- name: Build Obsidian Plugin
|
||||
- name: ⚙️ Build Obsidian Plugin
|
||||
run: |
|
||||
yarn
|
||||
yarn run build --if-present
|
||||
|
||||
- name: Upload Obsidian Plugin main.js
|
||||
- name: ⏫ Upload Obsidian Plugin main.js
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
if-no-files-found: error
|
||||
name: main.js
|
||||
path: src/interface/obsidian/main.js
|
||||
|
||||
- name: Upload Obsidian Plugin manifest.json
|
||||
- name: ⏫ Upload Obsidian Plugin manifest.json
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
if-no-files-found: error
|
||||
name: manifest.json
|
||||
path: src/interface/obsidian/manifest.json
|
||||
|
||||
- name: Create Release
|
||||
- name: ⏫ Upload Obsidian Plugin styles.css
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
if-no-files-found: error
|
||||
name: styles.css
|
||||
path: src/interface/obsidian/styles.css
|
||||
|
||||
- name: 🌈 Create Release
|
||||
uses: softprops/action-gh-release@v1
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
with:
|
||||
@@ -54,9 +61,10 @@ jobs:
|
||||
files: |
|
||||
src/interface/obsidian/main.js
|
||||
src/interface/obsidian/manifest.json
|
||||
src/interface/obsidian/styles.css
|
||||
|
||||
publish_desktop_apps:
|
||||
name: Publish Desktop Apps
|
||||
name: 🖥️ Publish Desktop Apps
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
@@ -75,7 +83,7 @@ jobs:
|
||||
with:
|
||||
python-version: '3.9'
|
||||
|
||||
- name: Install Dependencies
|
||||
- name: ⏬️ Install Dependencies
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
@@ -84,11 +92,11 @@ jobs:
|
||||
python -m pip install --upgrade pip
|
||||
pip install pyinstaller
|
||||
|
||||
- name: Install Khoj App
|
||||
- name: ⬇️ Install Khoj App
|
||||
run: |
|
||||
pip install --upgrade .
|
||||
|
||||
- name: Package Khoj App
|
||||
- name: 📦 Package Khoj App
|
||||
shell: bash
|
||||
run: |
|
||||
# Setup Environment for Reproducible Builds
|
||||
@@ -100,7 +108,7 @@ jobs:
|
||||
mv dist/Khoj.exe dist/khoj_"$GITHUB_REF_NAME"_amd64.exe
|
||||
fi
|
||||
|
||||
- name: Create Mac App DMG
|
||||
- name: 💻 Create Mac App DMG
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: |
|
||||
# Install Mac DMG Creator
|
||||
@@ -124,7 +132,7 @@ jobs:
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
with:
|
||||
ruby-version: '3.0'
|
||||
- name: Create Debian Package
|
||||
- name: 🐧 Create Debian Package
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
shell: bash
|
||||
env:
|
||||
@@ -154,7 +162,7 @@ jobs:
|
||||
name: khoj_${{github.ref_name}}_amd64.${{matrix.extension}}
|
||||
path: dist/khoj_${{github.ref_name}}_amd64.${{matrix.extension}}
|
||||
|
||||
- name: Release
|
||||
- name: 🌈 Release
|
||||
uses: softprops/action-gh-release@v1
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
with:
|
||||
|
||||
8
.github/workflows/test.yml
vendored
8
.github/workflows/test.yml
vendored
@@ -41,16 +41,16 @@ jobs:
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
|
||||
- name: Install Dependencies
|
||||
- name: ⏬️ Install Dependencies
|
||||
run: |
|
||||
sudo apt update && sudo apt install -y libegl1
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
- name: Install Application
|
||||
- name: ⬇️ Install Application
|
||||
run: pip install --upgrade .[dev]
|
||||
|
||||
- name: Validate Application
|
||||
- name: 🌡️ Validate Application
|
||||
run: pre-commit run --hook-stage manual --all
|
||||
|
||||
- name: Test Application
|
||||
- name: 🧪 Test Application
|
||||
run: pytest
|
||||
|
||||
7
.github/workflows/test_khoj_el.yml
vendored
7
.github/workflows/test_khoj_el.yml
vendored
@@ -33,7 +33,7 @@ jobs:
|
||||
with:
|
||||
version: ${{ matrix.emacs_version }}
|
||||
- uses: actions/checkout@v3
|
||||
- name: Test Khoj.el
|
||||
- name: 🧪 Test Khoj.el
|
||||
run: |
|
||||
# Run ERT tests on khoj.el
|
||||
emacs -batch \
|
||||
@@ -42,7 +42,10 @@ jobs:
|
||||
(push '(\"melpa\" . \"https://melpa.org/packages/\") package-archives) \
|
||||
(package-initialize) \
|
||||
(unless package-archive-contents (package-refresh-contents)) \
|
||||
(unless (package-installed-p 'transient) (package-install 'transient)))" \
|
||||
(unless (package-installed-p 'transient) (package-install 'transient)) \
|
||||
(unless (package-installed-p 'dash) (package-install 'dash)) \
|
||||
(unless (package-installed-p 'org) (package-install 'org)) \
|
||||
)" \
|
||||
-l ert \
|
||||
-l ./src/interface/emacs/khoj.el \
|
||||
-l ./src/interface/emacs/tests/khoj-tests.el \
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -10,6 +10,8 @@ __pycache__
|
||||
.emacs.desktop*
|
||||
*.py[cod]
|
||||
.vscode
|
||||
.env
|
||||
.venv/*
|
||||
|
||||
# Build artifacts
|
||||
/src/khoj/interface/web/images
|
||||
|
||||
13
Dockerfile
13
Dockerfile
@@ -1,17 +1,14 @@
|
||||
# syntax=docker/dockerfile:1
|
||||
FROM python:3.10-slim-bullseye
|
||||
FROM ubuntu:kinetic
|
||||
LABEL org.opencontainers.image.source https://github.com/debanjum/khoj
|
||||
|
||||
# Install System Dependencies
|
||||
RUN apt-get update -y && \
|
||||
apt-get -y install python3-pyqt5
|
||||
|
||||
# Copy Application to Container
|
||||
COPY . /app
|
||||
WORKDIR /app
|
||||
RUN apt update -y && \
|
||||
apt -y install python3-pip python3-pyqt6
|
||||
|
||||
# Install Python Dependencies
|
||||
RUN pip install --upgrade pip && pip install --upgrade ".[dev]"
|
||||
RUN pip install --upgrade pip && \
|
||||
pip install --upgrade --pre khoj-assistant
|
||||
|
||||
# Run the Application
|
||||
# There are more arguments required for the application to run,
|
||||
|
||||
135
README.md
135
README.md
@@ -3,7 +3,7 @@
|
||||
[](https://github.com/debanjum/khoj/pkgs/container/khoj)
|
||||
[](https://pypi.org/project/khoj-assistant/)
|
||||
|
||||
*A natural language search engine for your personal notes, transactions and images*
|
||||
*A search assistant for your second brain*
|
||||
|
||||
**Supported Plugins**
|
||||
|
||||
@@ -20,25 +20,25 @@
|
||||
- [Architecture](#Architecture)
|
||||
- [Setup](#Setup)
|
||||
- [Install](#1-Install)
|
||||
- [Configure](#2-Configure)
|
||||
- [Run](#3-Run)
|
||||
- [Run](#2-Run)
|
||||
- [Configure](#3-Configure)
|
||||
- [Install Plugins](#4-install-interface-plugins)
|
||||
- [Use](#Use)
|
||||
- [Interfaces](#Interfaces-1)
|
||||
- [Query Filters](#Query-filters)
|
||||
- [Khoj Search](#Khoj-search)
|
||||
- [Khoj Chat](#Khoj-chat)
|
||||
- [Upgrade](#Upgrade)
|
||||
- [Khoj Server](#upgrade-khoj-server)
|
||||
- [Khoj.el](#upgrade-khoj-on-emacs)
|
||||
- [Khoj Obsidian](#upgrade-khoj-on-obsidian)
|
||||
- [Uninstall Khoj](#uninstall-khoj)
|
||||
- [Uninstall](#uninstall)
|
||||
- [Troubleshoot](#Troubleshoot)
|
||||
- [Advanced Usage](#advanced-usage)
|
||||
- [Access Khoj on Mobile](#access-khoj-on-mobile)
|
||||
- [Chat with Notes](#chat-with-notes)
|
||||
- [Use OpenAI Models for Search](#use-openai-models-for-search)
|
||||
- [Search across Different Languages](#search-across-different-languages)
|
||||
- [Miscellaneous](#Miscellaneous)
|
||||
- [Setup OpenAI API key in Khoj](#set-your-openai-api-key-in-khoj)
|
||||
- [Beta API](#beta-api)
|
||||
- [GPT API](#gpt-api)
|
||||
- [Performance](#Performance)
|
||||
- [Query Performance](#Query-performance)
|
||||
- [Indexing Performance](#Indexing-performance)
|
||||
@@ -53,13 +53,18 @@
|
||||
- [Credits](#Credits)
|
||||
|
||||
## Features
|
||||
|
||||
- **Natural**: Advanced natural language understanding using Transformer based ML Models
|
||||
- **Local**: Your personal data stays local. All search, indexing is done on your machine[\*](https://github.com/debanjum/khoj#beta-api)
|
||||
- **Incremental**: Incremental search for a fast, search-as-you-type experience
|
||||
- **Pluggable**: Modular architecture makes it easy to plug in new data sources, frontends and ML models
|
||||
- **Multiple Sources**: Search your Org-mode and Markdown notes, Beancount transactions and Photos
|
||||
- **Multiple Interfaces**: Search from your [Web Browser](./src/khoj/interface/web/index.html), [Emacs](./src/interface/emacs/khoj.el) or [Obsidian](./src/interface/obsidian/)
|
||||
- **Search**
|
||||
- **Local**: Your personal data stays local. All search and indexing is done on your machine. *Unlike chat which requires access to GPT.*
|
||||
- **Incremental**: Incremental search for a fast, search-as-you-type experience
|
||||
- **Chat**
|
||||
- **Faster answers**: Find answers faster, smoother than search. No need to manually scan through your notes to find answers.
|
||||
- **Iterative discovery**: Iteratively explore and (re-)discover your notes
|
||||
- **Assisted creativity**: Smoothly weave across answer retrieval and content generation
|
||||
- **General**
|
||||
- **Natural**: Advanced natural language understanding using Transformer based ML Models
|
||||
- **Pluggable**: Modular architecture makes it easy to plug in new data sources, frontends and ML models
|
||||
- **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions and Photos
|
||||
- **Multiple Interfaces**: Interact from your [Web Browser](./src/khoj/interface/web/index.html), [Emacs](./src/interface/emacs/khoj.el) or [Obsidian](./src/interface/obsidian/)
|
||||
|
||||
## Demos
|
||||
### Khoj in Obsidian
|
||||
@@ -107,9 +112,11 @@ https://user-images.githubusercontent.com/6413477/184735169-92c78bf1-d827-4663-9
|
||||
## Setup
|
||||
These are the general setup instructions for Khoj.
|
||||
|
||||
- Check the [Khoj.el Readme](https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Setup) to setup Khoj with Emacs
|
||||
- Make sure [python](https://realpython.com/installing-python/) (version 3.10 or lower) and [pip](https://pip.pypa.io/en/stable/installation/) are installed on your machine
|
||||
- Check the [Khoj.el Readme](https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Setup) to setup Khoj with Emacs<br />
|
||||
It's simpler as it can skip the server *install*, *run* and *configure* steps below.
|
||||
- Check the [Khoj Obsidian Readme](https://github.com/debanjum/khoj/tree/master/src/interface/obsidian#Setup) to setup Khoj with Obsidian<br />
|
||||
Its simpler as it can skip the configure step below.
|
||||
It's simpler as it can skip the *configure* step below.
|
||||
|
||||
### 1. Install
|
||||
|
||||
@@ -117,32 +124,42 @@ These are the general setup instructions for Khoj.
|
||||
pip install khoj-assistant
|
||||
```
|
||||
|
||||
### 2. Start App
|
||||
### 2. Run
|
||||
|
||||
```shell
|
||||
khoj
|
||||
```
|
||||
|
||||
Note: To start Khoj automatically in the background, use [Task scheduler](https://www.windowscentral.com/how-create-automated-task-using-task-scheduler-windows-10) on Windows or [Cron](https://en.wikipedia.org/wiki/Cron) on Mac or Linux (e.g. with `@reboot khoj`)
|
||||
|
||||
### 3. Configure
|
||||
|
||||
1. Enable content types and point to files to search in the First Run Screen that pops up on app start
|
||||
2. Click `Configure` and wait. The app will download ML models and index the content for search
|
||||
|
||||
## Use
|
||||
### Interfaces
|
||||
### 4. Install Interface Plugins
|
||||
Khoj exposes a web interface by default.<br />
|
||||
The optional steps below allow using Khoj from within an existing application like Obsidian or Emacs.
|
||||
|
||||
- **Khoj Obsidian**:<br />
|
||||
[Install](https://github.com/debanjum/khoj/tree/master/src/interface/obsidian#2-Setup-Plugin) the Khoj Obsidian plugin
|
||||
|
||||
- **Khoj Emacs**:<br />
|
||||
[Install](https://github.com/debanjum/khoj/tree/master/src/interface/emacs#2-Install-Khojel) khoj.el
|
||||
|
||||
## Use
|
||||
### Khoj Search
|
||||
- **Khoj via Obsidian**
|
||||
- [Install](https://github.com/debanjum/khoj/tree/master/src/interface/obsidian#2-Setup-Plugin) the Khoj Obsidian plugin
|
||||
- Click the *Khoj search* icon 🔎 on the [Ribbon](https://help.obsidian.md/User+interface/Workspace/Ribbon) or Search for *Khoj: Search* in the [Command Palette](https://help.obsidian.md/Plugins/Command+palette)
|
||||
- **Khoj via Emacs**
|
||||
- [Install](https://github.com/debanjum/khoj/tree/master/src/interface/emacs#installation) [khoj.el](./src/interface/emacs/khoj.el)
|
||||
- Run `M-x khoj <user-query>`
|
||||
- **Khoj via Web**
|
||||
- Open <http://localhost:8000/> via desktop interface or directly
|
||||
- **Khoj via API**
|
||||
- See the Khoj FastAPI [Swagger Docs](http://localhost:8000/docs), [ReDocs](http://localhost:8000/redocs)
|
||||
|
||||
### Query Filters
|
||||
<details><summary>Query Filters</summary>
|
||||
|
||||
Use structured query syntax to filter the natural language search results
|
||||
- **Word Filter**: Get entries that include/exclude a specified term
|
||||
- Entries that contain term_to_include: `+"term_to_include"`
|
||||
@@ -161,17 +178,41 @@ Use structured query syntax to filter the natural language search results
|
||||
- excluding words *"big"* and *"brother"*
|
||||
- that best match the natural language query *"what is the meaning of life?"*
|
||||
|
||||
</details>
|
||||
|
||||
### Khoj Chat
|
||||
#### Overview
|
||||
- Creates a personal assistant for you to inquire and engage with your notes
|
||||
- Uses [ChatGPT](https://openai.com/blog/chatgpt) and [Khoj search](#khoj-search)
|
||||
- Supports multi-turn conversations with the relevant notes for context
|
||||
- Shows reference notes used to generate a response
|
||||
- **Note**: *Your query and top notes from khoj search will be sent to OpenAI for processing*
|
||||
|
||||
#### Setup
|
||||
- [Setup your OpenAI API key in Khoj](#set-your-openai-api-key-in-khoj)
|
||||
|
||||
#### Use
|
||||
1. Open [/chat](http://localhost:8000/chat)[^2]
|
||||
2. Type your queries and see response by Khoj from your notes
|
||||
|
||||
#### Demo
|
||||

|
||||
|
||||
### Details
|
||||
1. Your query is used to retrieve the most relevant notes, if any, using Khoj search
|
||||
2. These notes, the last few messages and associated metadata are passed to ChatGPT along with your query for a response
|
||||
|
||||
## Upgrade
|
||||
### Upgrade Khoj Server
|
||||
```shell
|
||||
pip install --upgrade khoj-assistant
|
||||
```
|
||||
|
||||
- Note: To upgrade to the latest pre-release version of the khoj server run below command
|
||||
```shell
|
||||
# Maps to the latest commit on the master branch
|
||||
pip install --upgrade --pre khoj-assistant
|
||||
```
|
||||
*Note: To upgrade to the latest pre-release version of the khoj server, run the command below*
|
||||
```shell
|
||||
# Maps to the latest commit on the master branch
|
||||
pip install --upgrade --pre khoj-assistant
|
||||
```
|
||||
|
||||
### Upgrade Khoj on Emacs
|
||||
- Use your Emacs Package Manager to Upgrade
|
||||
@@ -181,7 +222,7 @@ pip install --upgrade khoj-assistant
|
||||
- Upgrade via the Community plugins tab on the settings pane in the Obsidian app
|
||||
- See the [khoj plugin readme](https://github.com/debanjum/khoj/tree/master/src/interface/obsidian#2-Setup-Plugin) for details
|
||||
|
||||
## Uninstall Khoj
|
||||
## Uninstall
|
||||
1. (Optional) Hit `Ctrl-C` in the terminal running the khoj server to stop it
|
||||
2. Delete the khoj directory in your home folder (i.e `~/.khoj` on Linux, Mac or `C:\Users\<your-username>\.khoj` on Windows)
|
||||
3. Uninstall the khoj server with `pip uninstall khoj-assistant`
|
||||
@@ -220,23 +261,6 @@ pip install --upgrade khoj-assistant
|
||||
|
||||

|
||||
|
||||
### Chat with Notes
|
||||
#### Overview
|
||||
- Provides a chat interface to inquire and engage with your notes
|
||||
- Chat Types:
|
||||
- **Summarize**: Pulls the most relevant note from your notes and summarizes it
|
||||
- **Chat**: Also does general chat. It guesses whether to give a general response or search, summarizes from your note. <br />
|
||||
E.g *"how was your day?"* will give a general response. But *When did I go surfing?* should give a response from your notes
|
||||
- **Note**: *Your query and top note from search result will be sent to OpenAI for processing*
|
||||
|
||||
#### Use
|
||||
1. [Setup your OpenAI API key in Khoj](#set-your-openai-api-key-in-khoj)
|
||||
2. Open [/chat?t=summarize](http://localhost:8000/chat?t=summarize)[^2]
|
||||
3. Type your queries, see summarized response by Khoj from your notes
|
||||
|
||||
#### Demo
|
||||

|
||||
|
||||
### Use OpenAI Models for Search
|
||||
#### Setup
|
||||
1. Set `encoder-type`, `encoder` and `model-directory` under `asymmetric` and/or `symmetric` `search-type` in your `khoj.yml`[^1]:
|
||||
@@ -263,11 +287,11 @@ pip install --upgrade khoj-assistant
|
||||
### Search across Different Languages
|
||||
To search for notes in multiple, different languages, you can use a [multi-lingual model](https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models).<br />
|
||||
For example, the [paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) supports [50+ languages](https://www.sbert.net/docs/pretrained_models.html#:~:text=we%20used%20the%20following%2050%2B%20languages), has good search quality and speed. To use it:
|
||||
1. Manually update `search-type > asymmetric > encoder` to `sentence-transformer/paraphrase-multilingual-MiniLM-L12-v2` in your `~/.khoj/khoj.yml` file for now. See diff of `khoj.yml` below for illustration:
|
||||
1. Manually update `search-type > asymmetric > encoder` to `paraphrase-multilingual-MiniLM-L12-v2` in your `~/.khoj/khoj.yml` file for now. See diff of `khoj.yml` below for illustration:
|
||||
```diff
|
||||
asymmetric:
|
||||
- encoder: "sentence-transformers/multi-qa-MiniLM-L6-cos-vi"
|
||||
+ encoder: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
||||
- encoder: "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
|
||||
+ encoder: "paraphrase-multilingual-MiniLM-L12-v2"
|
||||
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||
model_directory: "~/.khoj/search/asymmetric/"
|
||||
```
|
||||
@@ -279,7 +303,7 @@ pip install --upgrade khoj-assistant
|
||||
If you want, Khoj can be configured to use OpenAI for search and chat.<br />
|
||||
Add your OpenAI API to Khoj by using either of the two options below:
|
||||
- Open the Khoj desktop GUI, add your [OpenAI API key](https://beta.openai.com/account/api-keys) and click *Configure*
|
||||
Ensure khoj is started without the `--no-gui` flag. Check your system tray to see if Khoj 🦅 is minimized there.
|
||||
Ensure khoj is started **without** the `--no-gui` flag. Check your system tray to see if Khoj 🦅 is minimized there.
|
||||
- Set `openai-api-key` field under `processor.conversation` section in your `khoj.yml`[^1] to your [OpenAI API key](https://beta.openai.com/account/api-keys) and restart khoj:
|
||||
```diff
|
||||
processor:
|
||||
@@ -290,10 +314,10 @@ Add your OpenAI API to Khoj by using either of the two options below:
|
||||
conversation-logfile: "~/.khoj/processor/conversation/conversation_logs.json"
|
||||
```
|
||||
|
||||
**Warning**: *This will enable khoj to send your query and note(s) to OpenAI for processing*
|
||||
**Warning**: *This will enable Khoj to send your query and note(s) to OpenAI for processing*
|
||||
|
||||
### Beta API
|
||||
- The beta [chat](http://localhost:8000/api/beta/chat), [summarize](http://localhost:8000/api/beta/summarize) and [search](http://localhost:8000/api/beta/search) API endpoints use [OpenAI API](https://openai.com/api/)
|
||||
### GPT API
|
||||
- The [chat](http://localhost:8000/api/chat), [answer](http://localhost:8000/api/beta/answer) and [search](http://localhost:8000/api/beta/search) API endpoints use [OpenAI API](https://openai.com/api/)
|
||||
- They are disabled by default
|
||||
- To use them:
|
||||
1. [Setup your OpenAI API key in Khoj](#set-your-openai-api-key-in-khoj)
|
||||
@@ -332,8 +356,13 @@ Add your OpenAI API to Khoj by using either of the two options below:
|
||||
##### 1. Install
|
||||
|
||||
```shell
|
||||
# Get Khoj Code
|
||||
git clone https://github.com/debanjum/khoj && cd khoj
|
||||
|
||||
# Create, Activate Virtual Environment
|
||||
python3 -m venv .venv && source .venv/bin/activate
|
||||
|
||||
# Install Khoj for Development
|
||||
pip install -e .[dev]
|
||||
```
|
||||
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
{
|
||||
"id": "khoj",
|
||||
"name": "Khoj",
|
||||
"version": "0.2.6",
|
||||
"minAppVersion": "0.15.0",
|
||||
"description": "Natural, Incremental Search for your Second Brain 🦅",
|
||||
"author": "Debanjum Singh Solanky",
|
||||
"authorUrl": "https://github.com/debanjum",
|
||||
"isDesktopOnly": false
|
||||
"id": "khoj",
|
||||
"name": "Khoj",
|
||||
"version": "0.6.0",
|
||||
"minAppVersion": "0.15.0",
|
||||
"description": "Natural, Incremental Search for your Second Brain 🦅",
|
||||
"author": "Debanjum Singh Solanky",
|
||||
"authorUrl": "https://github.com/debanjum",
|
||||
"isDesktopOnly": false
|
||||
}
|
||||
|
||||
@@ -40,7 +40,9 @@ dependencies = [
|
||||
"defusedxml == 0.7.1",
|
||||
"fastapi == 0.77.1",
|
||||
"jinja2 == 3.1.2",
|
||||
"openai == 0.20.0",
|
||||
"openai >= 0.27.0",
|
||||
"tiktoken >= 0.3.0",
|
||||
"tenacity >= 8.2.2",
|
||||
"pillow == 9.3.0",
|
||||
"pydantic == 1.9.1",
|
||||
"pyqt6 == 6.3.1",
|
||||
@@ -50,6 +52,7 @@ dependencies = [
|
||||
"sentence-transformers == 2.2.2",
|
||||
"torch == 1.13.1",
|
||||
"uvicorn == 0.17.6",
|
||||
"aiohttp == 3.8.4",
|
||||
]
|
||||
dynamic = ["version"]
|
||||
|
||||
@@ -71,6 +74,7 @@ dev = [
|
||||
"mypy >= 1.0.1",
|
||||
"black >= 23.1.0",
|
||||
"pre-commit >= 3.0.4",
|
||||
"freezegun >= 1.2.0",
|
||||
]
|
||||
|
||||
[tool.hatch.version]
|
||||
@@ -95,3 +99,9 @@ warn_unused_ignores = false
|
||||
|
||||
[tool.black]
|
||||
line-length = 120
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
addopts = "--strict-markers"
|
||||
markers = [
|
||||
"chatquality: Evaluate chatbot capabilities and quality",
|
||||
]
|
||||
|
||||
82
scripts/bump_version.sh
Executable file
82
scripts/bump_version.sh
Executable file
@@ -0,0 +1,82 @@
|
||||
#!/bin/zsh
|
||||
|
||||
project_root=$PWD
|
||||
|
||||
while getopts 'nc:' opt;
|
||||
do
|
||||
case "${opt}" in
|
||||
c)
|
||||
# Get current project version
|
||||
current_version=$OPTARG
|
||||
|
||||
# Bump Obsidian plugin to current version
|
||||
cd $project_root/src/interface/obsidian
|
||||
sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json
|
||||
sed -E -i.bak "s/version\": \"(.*)\"/version\": \"$current_version\"/" manifest.json
|
||||
cp $project_root/versions.json .
|
||||
npm run version # append current version
|
||||
rm *.bak
|
||||
|
||||
# Bump Emacs package to current version
|
||||
cd ../emacs
|
||||
sed -E -i.bak "s/^;; Version: (.*)/;; Version: $current_version/" khoj.el
|
||||
git add khoj.el
|
||||
rm *.bak
|
||||
|
||||
# Copy current obsidian versioned files to project root
|
||||
cd $project_root
|
||||
cp src/interface/obsidian/versions.json .
|
||||
cp src/interface/obsidian/manifest.json .
|
||||
|
||||
# Run pre-commit validation to fix jsons
|
||||
pre-commit run --hook-stage manual --all
|
||||
|
||||
# Commit changes and tag commit for release
|
||||
git add \
|
||||
$project_root/src/interface/obsidian/package.json \
|
||||
$project_root/src/interface/obsidian/manifest.json \
|
||||
$project_root/src/interface/obsidian/versions.json \
|
||||
$project_root/src/interface/emacs/khoj.el \
|
||||
$project_root/manifest.json \
|
||||
$project_root/versions.json
|
||||
git commit -m "Release Khoj version $current_version"
|
||||
git tag $current_version master
|
||||
;;
|
||||
n)
|
||||
# Induce hatch to compute next version number
|
||||
# remove .dev[commits-since-tag] version suffix from hatch computed version number
|
||||
next_version=$(touch bump.txt && git add bump.txt && hatch version | sed 's/\.dev.*//g')
|
||||
git rm --cached -- bump.txt && rm bump.txt
|
||||
|
||||
# Bump Obsidian plugins to next version
|
||||
cd $project_root/src/interface/obsidian
|
||||
sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$next_version\",/" package.json
|
||||
sed -E -i.bak "s/version\": \"(.*)\"/version\": \"$next_version\"/" manifest.json
|
||||
npm run version # updates versions.json
|
||||
rm *.bak
|
||||
|
||||
# Bump Emacs package to next version
|
||||
cd $project_root/src/interface/emacs
|
||||
sed -E -i.bak "s/^;; Version: (.*)/;; Version: $next_version/" khoj.el
|
||||
rm *.bak
|
||||
|
||||
# Run pre-commit validations to fix jsons
|
||||
pre-commit run --hook-stage manual --all
|
||||
|
||||
# Commit changes
|
||||
git add \
|
||||
$project_root/src/interface/obsidian/package.json \
|
||||
$project_root/src/interface/obsidian/manifest.json \
|
||||
$project_root/src/interface/obsidian/versions.json \
|
||||
$project_root/src/interface/emacs/khoj.el
|
||||
git commit -m "Bump Khoj to pre-release version $next_version"
|
||||
;;
|
||||
?)
|
||||
echo -e "Invalid command option.\nUsage: $(basename $0) [-c] [-n]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Restore State
|
||||
cd $project_root
|
||||
@@ -1,16 +1,19 @@
|
||||
* Khoj Emacs 🦅
|
||||
[[https://stable.melpa.org/#/khoj][file:https://stable.melpa.org/packages/khoj-badge.svg]] [[https://melpa.org/#/khoj][file:https://melpa.org/packages/khoj-badge.svg]] [[https://github.com/debanjum/khoj/actions/workflows/build_khoj_el.yml][https://github.com/debanjum/khoj/actions/workflows/build_khoj_el.yml/badge.svg?]] [[https://github.com/debanjum/khoj/actions/workflows/test_khoj_el.yml][https://github.com/debanjum/khoj/actions/workflows/test_khoj_el.yml/badge.svg?]]
|
||||
|
||||
/Natural, Incremental Search for your Second Brain/
|
||||
/A search assistant for your second brain/
|
||||
|
||||
** Table of Contents
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#features][Features]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Interface][Interface]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Setup][Setup]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#1-Setup-Backend][Setup Backend]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#2-Install-Khojel][Install Khoj.el]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Direct-Install][Direct Install]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Minimal-Install][Minimal Install]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Standard-Install][Standard Install]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#With-Straight.el][With Straight.el]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Use][Use]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Search][Search]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Chat][Chat]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Find-similar-entries][Find Similar Entries]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Advanced-usage][Advanced Usage]]
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Khoj-menu][Khoj Menu]]
|
||||
@@ -19,47 +22,74 @@
|
||||
- [[https://github.com/debanjum/khoj/tree/master/src/interface/emacs#Upgrade-Khojel][Upgrade Khoj.el]]
|
||||
|
||||
** Features
|
||||
- *Natural*: Advanced natural language understanding using Transformer based ML Models
|
||||
- *Local*: Your personal data stays local. All search, indexing is done on your machine*
|
||||
- *Incremental*: Incremental search for a fast, search-as-you-type experience
|
||||
- *Search*
|
||||
- *Natural*: Advanced natural language understanding using Transformer based ML Models
|
||||
- *Local*: Your personal data stays local. All search, indexing is done on your machine*
|
||||
- *Incremental*: Incremental search for a fast, search-as-you-type experience
|
||||
- *Chat*
|
||||
- *Faster answers*: Find answers faster than search
|
||||
- *Iterative discovery*: Iteratively explore and (re-)discover your notes
|
||||
- *Assisted creativity*: Smoothly weave across answer retrieval and content generation
|
||||
|
||||
** Interface
|
||||
[[/docs/khoj_on_emacs.png]]
|
||||
|
||||
** Setup
|
||||
*** 1. Setup Backend
|
||||
#+begin_src shell
|
||||
pip install khoj-assistant && khoj
|
||||
#+end_src
|
||||
- /Make sure [[https://realpython.com/installing-python/][python]] (version 3.10 or lower) and [[https://pip.pypa.io/en/stable/installation/][pip]] are installed on your machine/
|
||||
|
||||
*** 2. Install Khoj.el
|
||||
**** Using MELPA
|
||||
#+begin_src elisp
|
||||
- /khoj.el attempts to automatically install, start and configure the khoj server./
|
||||
If this fails, follow [[https://github.com/debanjum/khoj/tree/master/#Setup][these instructions]] to manually setup the khoj server.
|
||||
|
||||
*** Direct Install
|
||||
#+begin_src elisp
|
||||
M-x package-install khoj
|
||||
#+end_src elisp
|
||||
#+end_src
|
||||
|
||||
Add below snippet to your Emacs config file
|
||||
#+begin_src elisp
|
||||
;; Install Khoj Package from MELPA Stable
|
||||
(use-package khoj
|
||||
:ensure t
|
||||
:pin melpa-stable
|
||||
:bind ("C-c s" . 'khoj))
|
||||
#+end_src
|
||||
*** Minimal Install
|
||||
Add below snippet to your Emacs config file.
|
||||
Indexes your org-agenda files, by default.
|
||||
|
||||
Note: Install ~khoj.el~ from MELPA (instead of MELPA Stable) if you installed the pre-release version of khoj
|
||||
- That is, use ~:pin melpa~ to install khoj.el in above snippet if khoj was installed with ~pip install --pre khoj-assistant~
|
||||
- Else use ~:pin melpa-stable~ to install khoj.el in above snippet if khoj was installed with ~pip install khoj-assistant~
|
||||
- This ensures both khoj.el and khoj app are from the same version (tagged or latest)
|
||||
#+begin_src elisp
|
||||
;; Install Khoj Package from MELPA Stable
|
||||
(use-package khoj
|
||||
:ensure t
|
||||
:pin melpa-stable
|
||||
:bind ("C-c s" . 'khoj))
|
||||
#+end_src
|
||||
|
||||
- Note: Install ~khoj.el~ from MELPA (instead of MELPA Stable) if you installed the pre-release version of khoj
|
||||
- That is, use ~:pin melpa~ to install khoj.el in above snippet if khoj server was installed with ~--pre~ flag, i.e ~pip install --pre khoj-assistant~
|
||||
- Else use ~:pin melpa-stable~ to install khoj.el in above snippet if khoj was installed with ~pip install khoj-assistant~
|
||||
- This ensures both khoj.el and khoj app are from the same version (git tagged or latest)
|
||||
|
||||
*** Standard Install
|
||||
Add below snippet to your Emacs config file.
|
||||
Indexes the specified org files, directories. Sets up OpenAI API key for Khoj Chat
|
||||
|
||||
#+begin_src elisp
|
||||
;; Install Khoj Package from MELPA Stable
|
||||
(use-package khoj
|
||||
:ensure t
|
||||
:pin melpa-stable
|
||||
:bind ("C-c s" . 'khoj)
|
||||
:config (setq khoj-org-directories '("~/docs/org-roam" "~/docs/notes")
|
||||
khoj-org-files '("~/docs/todo.org" "~/docs/work.org")
|
||||
khoj-openai-api-key "YOUR_OPENAI_API_KEY")) ; required to enable chat
|
||||
#+end_src
|
||||
|
||||
*** With [[https://github.com/raxod502/straight.el][Straight.el]]
|
||||
Add below snippet to your Emacs config file.
|
||||
Indexes the specified org files, directories. Sets up OpenAI API key for Khoj Chat
|
||||
|
||||
**** Using [[https://github.com/raxod502/straight.el][Straight.el]]
|
||||
Add below snippet to your Emacs config file
|
||||
#+begin_src elisp
|
||||
;; Install Khoj Package using Straight.el
|
||||
(use-package khoj
|
||||
:after org
|
||||
:straight (khoj :type git :host github :repo "debanjum/khoj" :files (:defaults "src/interface/emacs/khoj.el"))
|
||||
:bind ("C-c s" . 'khoj))
|
||||
:bind ("C-c s" . 'khoj)
|
||||
:config (setq khoj-org-directories '("~/docs/org-roam" "~/docs/notes")
|
||||
khoj-org-files '("~/docs/todo.org" "~/docs/work.org")
|
||||
khoj-openai-api-key "YOUR_OPENAI_API_KEY")) ; required to enable chat
|
||||
#+end_src
|
||||
|
||||
** Use
|
||||
@@ -70,6 +100,15 @@
|
||||
|
||||
e.g "What is the meaning of life?", "My life goals for 2023"
|
||||
|
||||
*** Chat
|
||||
1. Hit ~C-c s c~ (or ~M-x khoj RET c~) to open khoj chat
|
||||
|
||||
2. Ask questions in a natural, conversational style
|
||||
|
||||
E.g "When did I file my taxes last year?"
|
||||
|
||||
See [[https://github.com/debanjum/khoj/tree/master/#Khoj-Chat][Khoj Chat]] for more details
|
||||
|
||||
*** Find Similar Entries
|
||||
This feature finds entries similar to the one you are currently on.
|
||||
1. Move cursor to the org-mode entry, markdown section or text paragraph you want to find similar entries for
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
;;; khoj.el --- Natural, Incremental Search for your Second Brain -*- lexical-binding: t -*-
|
||||
;;; khoj.el --- A search assistant for your second brain -*- lexical-binding: t -*-
|
||||
|
||||
;; Copyright (C) 2021-2022 Debanjum Singh Solanky
|
||||
|
||||
;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
|
||||
;; Description: Natural, Incremental Search for your Second Brain
|
||||
;; Keywords: search, org-mode, outlines, markdown, beancount, ledger, image
|
||||
;; Version: 0.3.0
|
||||
;; Package-Requires: ((emacs "27.1") (transient "0.3.0"))
|
||||
;; Description: A search assistant for your second brain
|
||||
;; Keywords: search, chat, org-mode, outlines, markdown, beancount, image
|
||||
;; Version: 0.6.0
|
||||
;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
|
||||
;; URL: https://github.com/debanjum/khoj/tree/master/src/interface/emacs
|
||||
|
||||
;; This file is NOT part of GNU Emacs.
|
||||
@@ -28,21 +28,24 @@
|
||||
|
||||
;;; Commentary:
|
||||
|
||||
;; This package provides a natural, incremental search interface to your
|
||||
;; `org-mode' notes, `markdown' files, `beancount' transactions and images.
|
||||
;; It is a wrapper that interfaces with the Khoj server.
|
||||
;; The server exposes an API for advanced search using transformer ML models.
|
||||
;; The Khoj server needs to be running to use this package.
|
||||
;; See the repository docs for detailed setup of the Khoj server.
|
||||
;; Create a search assistant for your `org-mode', `markdown' notes,
|
||||
;; `beancount' transactions and images. This package exposes two
|
||||
;; assistance modes, search and chat:
|
||||
;;
|
||||
;; Chat provides faster answers, iterative discovery and assisted
|
||||
;; creativity. It requires your OpenAI API key to access GPT models
|
||||
;;
|
||||
;; Search allows natural language, incremental and local search.
|
||||
;; It relies on AI models that run locally on your machine.
|
||||
;;
|
||||
;; Quickstart
|
||||
;; -------------
|
||||
;; 1. Install Khoj Server
|
||||
;; pip install khoj-assistant
|
||||
;; 2. Start, Configure Khoj Server
|
||||
;; khoj
|
||||
;; 3. Install khoj.el from MELPA Stable
|
||||
;; 1. Install khoj.el from MELPA Stable
|
||||
;; (use-package khoj :pin melpa-stable :bind ("C-c s" . 'khoj))
|
||||
;; 2. Start khoj from Emacs
|
||||
;; C-c s or M-x khoj
|
||||
;;
|
||||
;; See the repository docs for detailed setup and configuration steps.
|
||||
|
||||
;;; Code:
|
||||
|
||||
@@ -50,6 +53,9 @@
|
||||
(require 'json)
|
||||
(require 'transient)
|
||||
(require 'outline)
|
||||
(require 'dash)
|
||||
(require 'org)
|
||||
|
||||
(eval-when-compile (require 'subr-x)) ;; for string-trim before Emacs 28.2
|
||||
|
||||
|
||||
@@ -62,6 +68,11 @@
|
||||
:group 'khoj
|
||||
:type 'string)
|
||||
|
||||
(defcustom khoj-server-is-local t
|
||||
"Non-nil if the Khoj server runs on the local machine."
|
||||
:group 'khoj
|
||||
:type 'boolean)
|
||||
|
||||
(defcustom khoj-image-width 156
|
||||
"Width of rendered images returned by Khoj."
|
||||
:group 'khoj
|
||||
@@ -97,12 +108,17 @@
|
||||
(defconst khoj--query-prompt "🦅Khoj: "
|
||||
"Query prompt shown in the minibuffer.")
|
||||
|
||||
(defconst khoj--buffer-name "*🦅Khoj*"
|
||||
"Name of buffer to show results from Khoj.")
|
||||
(defconst khoj--search-buffer-name "*🦅Khoj Search*"
|
||||
"Name of buffer to show search results from Khoj.")
|
||||
|
||||
(defconst khoj--chat-buffer-name "*🦅Khoj Chat*"
|
||||
"Name of chat buffer for Khoj.")
|
||||
|
||||
(defvar khoj--content-type "org"
|
||||
"The type of content to perform search on.")
|
||||
|
||||
(declare-function org-element-property "org-mode" (PROPERTY ELEMENT))
|
||||
(declare-function org-element-type "org-mode" (ELEMENT))
|
||||
(declare-function beancount-mode "beancount" ())
|
||||
(declare-function markdown-mode "markdown-mode" ())
|
||||
(declare-function org-music-mode "org-music" ())
|
||||
@@ -128,6 +144,7 @@ NO-PAGING FILTER))
|
||||
"C-x M | music\n"))))
|
||||
|
||||
(defvar khoj--rerank nil "Track when re-rank of results triggered.")
|
||||
(defvar khoj--reference-count 0 "Track number of references currently in chat buffer.")
|
||||
(defun khoj--search-markdown () "Set content-type to `markdown'." (interactive) (setq khoj--content-type "markdown"))
|
||||
(defun khoj--search-org () "Set content-type to `org-mode'." (interactive) (setq khoj--content-type "org"))
|
||||
(defun khoj--search-ledger () "Set content-type to `ledger'." (interactive) (setq khoj--content-type "ledger"))
|
||||
@@ -162,6 +179,305 @@ Use `which-key` if available, else display simple message in echo area"
|
||||
nil t t))
|
||||
(message "%s" (khoj--keybindings-info-message))))
|
||||
|
||||
|
||||
;; ----------------
|
||||
;; Khoj Setup
|
||||
;; ----------------
|
||||
(defcustom khoj-server-command
|
||||
(or (executable-find "khoj")
|
||||
(executable-find "khoj.exe")
|
||||
"khoj")
|
||||
"Command to interact with Khoj server."
|
||||
:type 'string
|
||||
:group 'khoj)
|
||||
|
||||
(defcustom khoj-server-args '("--no-gui")
|
||||
"Arguments to pass to Khoj server on startup."
|
||||
:type '(repeat string)
|
||||
:group 'khoj)
|
||||
|
||||
(defcustom khoj-server-python-command
|
||||
(if (equal system-type 'windows-nt)
|
||||
(or (executable-find "py")
|
||||
(executable-find "pythonw")
|
||||
"python")
|
||||
(if (executable-find "python")
|
||||
"python"
|
||||
;; Fallback on systems where python is not
|
||||
;; symlinked to python3.
|
||||
"python3"))
|
||||
"The Python interpreter used for the Khoj server.
|
||||
|
||||
Khoj will try to use the system interpreter if it exists. If you wish
|
||||
to use a specific python interpreter (from a virtual environment
|
||||
for example), set this to the full interpreter path."
|
||||
:type '(choice (const :tag "python" "python")
|
||||
(const :tag "python3" "python3")
|
||||
(const :tag "pythonw (Python on Windows)" "pythonw")
|
||||
(const :tag "py (other Python on Windows)" "py")
|
||||
(string :tag "Other"))
|
||||
:safe (lambda (val)
|
||||
(member val '("python" "python3" "pythonw" "py")))
|
||||
:group 'khoj)
|
||||
|
||||
(defcustom khoj-org-files (org-agenda-files t t)
|
||||
"List of org-files to index on khoj server."
|
||||
:type '(repeat string)
|
||||
:group 'khoj)
|
||||
|
||||
(defcustom khoj-org-directories nil
|
||||
"List of directories with org-mode files to index on khoj server."
|
||||
:type '(repeat string)
|
||||
:group 'khoj)
|
||||
|
||||
(defcustom khoj-openai-api-key nil
|
||||
"OpenAI API key used to configure chat on khoj server."
|
||||
:type 'string
|
||||
:group 'khoj)
|
||||
|
||||
(defcustom khoj-auto-setup t
  "Automate install, configure and start of khoj server.
Auto invokes setup steps on calling main entrypoint."
  ;; The option is a flag (default t), so expose it as a toggle in the
  ;; Customize UI instead of a free-form string field.
  :type 'boolean
  :group 'khoj)
|
||||
|
||||
(defvar khoj--server-process nil "Track khoj server process.")
|
||||
(defvar khoj--server-name "*khoj-server*" "Track khoj server buffer.")
|
||||
(defvar khoj--server-ready? nil "Track if khoj server is ready to receive API calls.")
|
||||
(defvar khoj--server-configured? t "Track if khoj server is configured to receive API calls.")
|
||||
(defvar khoj--progressbar '(🌑 🌘 🌗 🌖 🌕 🌔 🌓 🌒) "Track progress via moon phase animations.")
|
||||
|
||||
(defun khoj--server-get-version ()
  "Return the khoj server version, or nil if it cannot be determined.
Run `khoj-server-command' with the \"--version\" flag and return the first
run of lowercase letters, digits and dots found in its output."
  (with-temp-buffer
    (call-process khoj-server-command nil t nil "--version")
    (goto-char (point-min))
    ;; Pass NOERROR so empty or unexpected output yields nil instead of
    ;; signalling `search-failed'; callers test the result with `not'.
    (when (re-search-forward "\\([a-z0-9.]+\\)" nil t)
      (match-string 1))))
|
||||
|
||||
(defun khoj--server-install-upgrade ()
|
||||
"Install or upgrade the khoj server."
|
||||
(with-temp-buffer
|
||||
(message "khoj.el: Installing server...")
|
||||
(if (/= (apply 'call-process khoj-server-python-command
|
||||
nil t nil
|
||||
"-m" "pip" "install" "--upgrade"
|
||||
'("khoj-assistant"))
|
||||
0)
|
||||
(message "khoj.el: Failed to install Khoj server. Please install it manually using pip install `khoj-assistant'.\n%s" (buffer-string))
|
||||
(message "khoj.el: Installed and upgraded Khoj server version: %s" (khoj--server-get-version)))))
|
||||
|
||||
(defun khoj--server-start ()
  "Start the khoj server.
Launch `khoj-server-command' with `khoj-server-args' plus the host and port
parsed from `khoj-server-url', and track the process in `khoj--server-process'.
The process filter watches server output to update `khoj--server-ready?' and
`khoj--server-configured?', and echoes startup and indexing progress."
  (interactive)
  (let* ((url-parts (split-string (cadr (split-string khoj-server-url "://")) ":"))
         (server-host (nth 0 url-parts))
         (server-port (or (nth 1 url-parts) "80"))
         (server-args (append khoj-server-args
                              (list (format "--host=%s" server-host)
                                    (format "--port=%s" server-port)))))
    (message "khoj.el: Starting server at %s %s..." server-host server-port)
    (setq khoj--server-process
          (make-process
           :name khoj--server-name
           :buffer khoj--server-name
           :command (append (list khoj-server-command) server-args)
           :sentinel (lambda (_process event)
                       (message "khoj.el: khoj server stopped with: %s" event)
                       (setq khoj--server-ready? nil))
           :filter (lambda (process msg)
                     (cond
                      ;; quote the url so its dots are matched literally
                      ((string-match (format "Uvicorn running on %s" (regexp-quote khoj-server-url)) msg)
                       (setq khoj--server-ready? t)
                       (khoj--server-configure))
                      ((string-match "Batches: " msg)
                       (when (string-match "\\([0-9]+\\.[0-9]+\\|\\([0-9]+\\)\\)%?" msg)
                         (message "khoj.el: %s updating index %s"
                                  ;; the matched progress can be fractional and `%' only
                                  ;; accepts integers, so truncate before indexing
                                  (nth (% (truncate (string-to-number (match-string 1 msg)))
                                          (length khoj--progressbar))
                                       khoj--progressbar)
                                  (match-string 0 msg)))
                       (setq khoj--server-configured? nil))
                      ((and (not khoj--server-configured?)
                            (string-match "Processor reconfigured via API" msg))
                       (setq khoj--server-configured? t))
                      ((and (not khoj--server-ready?)
                            (or (string-match "configure.py" msg)
                                (string-match "main.py" msg)
                                (string-match "api.py" msg)))
                       ;; echo each output line separately (not the whole chunk once per line)
                       (dolist (line (split-string msg "\n" t))
                         (let ((text (nth 1 (split-string line " " t " *"))))
                           (when text
                             (message "khoj.el: %s" text))))))
                     ;; call default process filter to write output to process buffer
                     (internal-default-process-filter process msg))))
    ;; Only touch the process once we know it exists
    (if khoj--server-process
        (set-process-query-on-exit-flag khoj--server-process nil)
      ;; NOTE(review): `buffer-string' reads whichever buffer is current here,
      ;; not the server buffer -- confirm this is the intended diagnostic.
      (message "khoj.el: Failed to start Khoj server. Please start it manually by running `khoj' on terminal.\n%s" (buffer-string)))))
|
||||
|
||||
(defun khoj--server-started? ()
  "Check if the khoj server has been started.
Return non-nil when the process tracked in `khoj--server-process' is live,
or when a request to the config API at `khoj-server-url' succeeds.  In the
latter case also set `khoj--server-ready?' to t."
  (cond
   ;; check for when server process handled from within emacs
   ((process-live-p khoj--server-process) t)
   ;; else general check via ping to khoj-server-url
   ((let ((response (ignore-errors
                      (url-retrieve-synchronously
                       (format "%s/api/config/data/default" khoj-server-url)))))
      (when (buffer-live-p response)
        ;; only reachability matters; kill the response buffer so repeated
        ;; checks do not pile up hidden url buffers
        (kill-buffer response)
        t))
    ;; Successful ping to non-emacs khoj server indicates it is started and ready.
    ;; So update ready state tracker variable (and implicitly return true for started)
    (setq khoj--server-ready? t))
   (t nil)))
|
||||
|
||||
(defun khoj--server-restart ()
|
||||
"Restart the khoj server."
|
||||
(interactive)
|
||||
(khoj--server-stop)
|
||||
(khoj--server-start))
|
||||
|
||||
(defun khoj--server-stop ()
  "Stop the khoj server.
Only a server process started from within Emacs (tracked in
`khoj--server-process') can be stopped from here."
  (interactive)
  (cond
   ((process-live-p khoj--server-process)
    (message "khoj.el: Stopping server...")
    (kill-process khoj--server-process)
    (message "khoj.el: Stopped server."))
   ;; A reachable server without a live Emacs process was started elsewhere;
   ;; there is no process object to kill, so tell the user instead of erroring.
   ((khoj--server-started?)
    (message "khoj.el: Khoj server was not started from Emacs. Please stop it manually."))))
|
||||
|
||||
(defun khoj--server-setup ()
|
||||
"Install and start the khoj server, if required."
|
||||
(interactive)
|
||||
;; Install khoj server, if not available but expected on local machine
|
||||
(when (and khoj-server-is-local
|
||||
(or (not (executable-find khoj-server-command))
|
||||
(not (khoj--server-get-version))))
|
||||
(khoj--server-install-upgrade))
|
||||
;; Start khoj server if not already started
|
||||
(when (not (khoj--server-started?))
|
||||
(khoj--server-start)))
|
||||
|
||||
(defun khoj--get-directory-from-config (config keys &optional level)
|
||||
"Extract directory under specified KEYS in CONFIG and trim it to LEVEL.
|
||||
CONFIG is json obtained from Khoj config API."
|
||||
(let ((item config))
|
||||
(dolist (key keys)
|
||||
(setq item (cdr (assoc key item))))
|
||||
(-> item
|
||||
(split-string "/")
|
||||
(butlast (or level nil))
|
||||
(string-join "/"))))
|
||||
|
||||
(defun khoj--server-configure ()
  "Configure the Khoj server for search and chat.
Fetch the current and default configuration from the config API at
`khoj-server-url'.  Point the org content type at `khoj-org-files' and
`khoj-org-directories'.  Set up the conversation processor when
`khoj-openai-api-key' is set, else drop the processor config.
Finally post the resulting configuration back to the server."
  (interactive)
  (let* ((org-directory-regexes (or (mapcar (lambda (dir) (format "%s/**/*.org" dir)) khoj-org-directories) json-null))
         (current-config
          (with-temp-buffer
            (url-insert-file-contents (format "%s/api/config/data" khoj-server-url))
            (ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false))))
         (default-config
          (with-temp-buffer
            (url-insert-file-contents (format "%s/api/config/data/default" khoj-server-url))
            (ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false))))
         (default-index-dir (khoj--get-directory-from-config default-config '(content-type org embeddings-file)))
         (default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile)))
         (default-model (or (alist-get 'model (alist-get 'conversation (alist-get 'processor default-config))) "text-davinci-003"))
         (config (or current-config default-config)))

    ;; NOTE: entries are added with `push' rather than `add-to-list' below.
    ;; `add-to-list' operates on a symbol's dynamic value and cannot update
    ;; these lexically bound variables.  Each key is removed with `delq'
    ;; first, so the membership check of `add-to-list' is not needed.

    ;; Configure content types
    (cond
     ;; If khoj backend is not configured yet
     ((not current-config)
      (setq config (delq (assoc 'content-type config) config))
      (push `(content-type . ((org . ((input-files . ,khoj-org-files)
                                      (input-filter . ,org-directory-regexes)
                                      (compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir))
                                      (embeddings-file . ,(format "%s/org.pt" default-index-dir))
                                      (index-heading-entries . ,json-false)))))
            config))

     ;; Else if khoj config has no org content config
     ((not (alist-get 'org (alist-get 'content-type config)))
      (let ((new-content-type (alist-get 'content-type config)))
        (setq new-content-type (delq (assoc 'org new-content-type) new-content-type))
        (push `(org . ((input-files . ,khoj-org-files)
                       (input-filter . ,org-directory-regexes)
                       (compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir))
                       (embeddings-file . ,(format "%s/org.pt" default-index-dir))
                       (index-heading-entries . ,json-false)))
              new-content-type)
        (setq config (delq (assoc 'content-type config) config))
        (push `(content-type . ,new-content-type) config)))

     ;; Else if khoj is not configured to index specified org files
     ((not (and (equal (alist-get 'input-files (alist-get 'org (alist-get 'content-type config))) khoj-org-files)
                (equal (alist-get 'input-filter (alist-get 'org (alist-get 'content-type config))) org-directory-regexes)))
      (let* ((index-directory (khoj--get-directory-from-config config '(content-type org embeddings-file)))
             (new-content-type (alist-get 'content-type config)))
        (setq new-content-type (delq (assoc 'org new-content-type) new-content-type))
        (push `(org . ((input-files . ,khoj-org-files)
                       (input-filter . ,org-directory-regexes)
                       (compressed-jsonl . ,(format "%s/org.jsonl.gz" index-directory))
                       (embeddings-file . ,(format "%s/org.pt" index-directory))
                       (index-heading-entries . ,json-false)))
              new-content-type)
        (setq config (delq (assoc 'content-type config) config))
        (push `(content-type . ,new-content-type) config))))

    ;; Configure processors
    (cond
     ;; Without an API key chat cannot work, so drop the processor config
     ((not khoj-openai-api-key)
      (setq config (delq (assoc 'processor config) config)))

     ;; If khoj backend is not configured yet
     ((not current-config)
      (setq config (delq (assoc 'processor config) config))
      (push `(processor . ((conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
                                            (model . ,default-model)
                                            (openai-api-key . ,khoj-openai-api-key)))))
            config))

     ;; Else if khoj config has no conversation processor config
     ((not (alist-get 'conversation (alist-get 'processor config)))
      (let ((new-processor-type (alist-get 'processor config)))
        (setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
        (push `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
                                (model . ,default-model)
                                (openai-api-key . ,khoj-openai-api-key)))
              new-processor-type)
        (setq config (delq (assoc 'processor config) config))
        (push `(processor . ,new-processor-type) config)))

     ;; Else if khoj is not configured with specified openai api key
     ((not (equal (alist-get 'openai-api-key (alist-get 'conversation (alist-get 'processor config))) khoj-openai-api-key))
      (let* ((chat-directory (khoj--get-directory-from-config config '(processor conversation conversation-logfile)))
             ;; Read the model name directly.  The directory helper strips the
             ;; last \"/\"-separated component, which for a bare model name
             ;; leaves an empty string.
             (model-name (or (alist-get 'model (alist-get 'conversation (alist-get 'processor config)))
                             default-model))
             (new-processor-type (alist-get 'processor config)))
        (setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
        (push `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" chat-directory))
                                (model . ,model-name)
                                (openai-api-key . ,khoj-openai-api-key)))
              new-processor-type)
        (setq config (delq (assoc 'processor config) config))
        (push `(processor . ,new-processor-type) config))))

    ;; Update server with latest configuration
    (khoj--post-new-config config)
    (cond ((not current-config)
           (message "khoj.el: ⚙️ Generated new khoj server configuration."))
          ((not (equal config current-config))
           (message "Khoj: ⚙️ Updated khoj server configuration")))))
|
||||
|
||||
(defun khoj-setup (&optional interact)
  "Install, start and configure Khoj server.
When INTERACT is non-nil (as it is when called interactively) and the
server cannot be reached, ask for confirmation before setting it up.
Otherwise set it up without asking."
  (interactive "p")
  ;; Setup khoj server if not running
  (let* ((not-started (not (khoj--server-started?)))
         (permitted (if (and not-started interact)
                        (y-or-n-p "Could not connect to Khoj server. Should I install, start and configure it for you?")
                      t)))
    ;; Install, start server if user permitted and server not ready
    (when (and permitted not-started)
      (khoj--server-setup))

    ;; Server can be started but not ready (to use/configure)
    ;; Wait until server is ready if setup was permitted.
    ;; Stop waiting if the server process died, else this would loop forever.
    (while (and permitted
                (not khoj--server-ready?)
                (process-live-p khoj--server-process))
      (sit-for 0.5))

    ;; Configure server once server ready if user permitted
    (when (and permitted khoj--server-ready?)
      (khoj--server-configure))))
|
||||
|
||||
|
||||
;; -----------------------------------------------
|
||||
;; Extract and Render Entries of each Content Type
|
||||
@@ -187,22 +503,21 @@ Use `which-key` if available, else display simple message in echo area"
|
||||
|
||||
(defun khoj--extract-entries-as-org (json-response query)
|
||||
"Convert JSON-RESPONSE, QUERY from API to `org-mode' entries."
|
||||
(let ((org-results-buffer-format-str "* %s\n%s\n#+STARTUP: showall hidestars inlineimages"))
|
||||
(thread-last
|
||||
json-response
|
||||
;; Extract and render each org-mode entry from response
|
||||
(mapcar (lambda (json-response-item)
|
||||
(thread-last
|
||||
;; Extract org entry from each item in json response
|
||||
(cdr (assoc 'entry json-response-item))
|
||||
;; Format org entry as a string
|
||||
(format "%s")
|
||||
;; Standardize results to 2nd level heading for consistent rendering
|
||||
(replace-regexp-in-string "^\*+" "**"))))
|
||||
;; Render entries into org formatted string with query set as top level heading
|
||||
(format org-results-buffer-format-str query)
|
||||
;; remove leading (, ) or SPC from extracted entries string
|
||||
(replace-regexp-in-string "^[\(\) ]" ""))))
|
||||
(thread-last
|
||||
json-response
|
||||
;; Extract and render each org-mode entry from response
|
||||
(mapcar (lambda (json-response-item)
|
||||
(thread-last
|
||||
;; Extract org entry from each item in json response
|
||||
(cdr (assoc 'entry json-response-item))
|
||||
;; Format org entry as a string
|
||||
(format "%s")
|
||||
;; Standardize results to 2nd level heading for consistent rendering
|
||||
(replace-regexp-in-string "^\*+" "**"))))
|
||||
;; Render entries into org formatted string with query set as top level heading
|
||||
(format "* %s\n%s\n" query)
|
||||
;; remove leading (, ) or SPC from extracted entries string
|
||||
(replace-regexp-in-string "^[\(\) ]" "")))
|
||||
|
||||
(defun khoj--extract-entries-as-ledger (json-response query)
|
||||
"Convert JSON-RESPONSE, QUERY from API to ledger entries."
|
||||
@@ -244,6 +559,18 @@ Use `which-key` if available, else display simple message in echo area"
|
||||
;; remove trailing (, ) or SPC from extracted entries string
|
||||
(replace-regexp-in-string "[\(\) ]$" ""))))
|
||||
|
||||
(defun khoj--extract-entries (json-response query)
|
||||
"Convert JSON-RESPONSE, QUERY from API to text entries."
|
||||
(thread-last json-response
|
||||
;; extract and render entries from API response
|
||||
(mapcar (lambda (args) (format "%s\n\n" (cdr (assoc 'entry args)))))
|
||||
;; Set query as heading in rendered results buffer
|
||||
(format "# Query: %s\n\n%s\n" query)
|
||||
;; remove leading (, ) or SPC from extracted entries string
|
||||
(replace-regexp-in-string "^[\(\) ]" "")
|
||||
;; remove trailing (, ) or SPC from extracted entries string
|
||||
(replace-regexp-in-string "[\(\) ]$" "")))
|
||||
|
||||
(defun khoj--buffer-name-to-content-type (buffer-name)
|
||||
"Infer content type based on BUFFER-NAME."
|
||||
(let ((enabled-content-types (khoj--get-enabled-content-types))
|
||||
@@ -260,30 +587,39 @@ Use `which-key` if available, else display simple message in echo area"
|
||||
;; Query Khoj API
|
||||
;; --------------
|
||||
|
||||
(defun khoj--post-new-config (config)
|
||||
"Configure khoj server with provided CONFIG."
|
||||
;; POST provided config to khoj server
|
||||
(let ((url-request-method "POST")
|
||||
(url-request-extra-headers '(("Content-Type" . "application/json")))
|
||||
(url-request-data (json-encode-alist config))
|
||||
(config-url (format "%s/api/config/data" khoj-server-url)))
|
||||
(with-current-buffer (url-retrieve-synchronously config-url)
|
||||
(buffer-string)))
|
||||
;; Update index on khoj server after configuration update
|
||||
(let ((khoj--server-ready? nil))
|
||||
(url-retrieve (format "%s/api/update?t=org" khoj-server-url) #'identity)))
|
||||
|
||||
(defun khoj--get-enabled-content-types ()
|
||||
"Get content types enabled for search from API."
|
||||
(let ((config-url (format "%s/api/config/data" khoj-server-url))
|
||||
(let ((config-url (format "%s/api/config/types" khoj-server-url))
|
||||
(url-request-method "GET"))
|
||||
(with-temp-buffer
|
||||
(erase-buffer)
|
||||
(url-insert-file-contents config-url)
|
||||
(let* ((json-response (json-parse-buffer :object-type 'alist))
|
||||
(content-type (cdr (assoc 'content-type json-response))))
|
||||
;; return content-type items with configuration
|
||||
(mapcar
|
||||
#'car
|
||||
(cl-remove-if-not
|
||||
(lambda (a) (not (eq (cdr a) :null)))
|
||||
content-type))))))
|
||||
(thread-last
|
||||
(json-parse-buffer :object-type 'alist)
|
||||
(mapcar #'intern)))))
|
||||
|
||||
(defun khoj--construct-api-query (query content-type &optional rerank)
|
||||
"Construct API Query from QUERY, CONTENT-TYPE and (optional) RERANK params."
|
||||
(defun khoj--construct-search-api-query (query content-type &optional rerank)
|
||||
"Construct Search API Query.
|
||||
Use QUERY, CONTENT-TYPE and (optional) RERANK as query params"
|
||||
(let ((rerank (or rerank "false"))
|
||||
(encoded-query (url-hexify-string query)))
|
||||
(format "%s/api/search?q=%s&t=%s&r=%s&n=%s" khoj-server-url encoded-query content-type rerank khoj-results-count)))
|
||||
|
||||
(defun khoj--query-api-and-render-results (query-url content-type query buffer-name)
|
||||
"Query Khoj QUERY-URL. Render results in BUFFER-NAME using QUERY, CONTENT-TYPE."
|
||||
(defun khoj--query-search-api-and-render-results (query-url content-type query buffer-name)
|
||||
"Query Khoj Search with QUERY-URL.
|
||||
Render results in BUFFER-NAME using QUERY, CONTENT-TYPE."
|
||||
;; get json response from api
|
||||
(with-current-buffer buffer-name
|
||||
(let ((inhibit-read-only t)
|
||||
@@ -300,9 +636,14 @@ Use `which-key` if available, else display simple message in echo area"
|
||||
((equal content-type "markdown") (khoj--extract-entries-as-markdown json-response query))
|
||||
((equal content-type "ledger") (khoj--extract-entries-as-ledger json-response query))
|
||||
((equal content-type "image") (khoj--extract-entries-as-images json-response query))
|
||||
(t (format "%s" json-response))))
|
||||
(cond ((equal content-type "org") (progn (org-mode)
|
||||
(visual-line-mode)))
|
||||
(t (khoj--extract-entries json-response query))))
|
||||
(cond ((equal content-type "org") (progn (visual-line-mode)
|
||||
(org-mode)
|
||||
(setq-local
|
||||
org-startup-folded "showall"
|
||||
org-hide-leading-stars t
|
||||
org-startup-with-inline-images t)
|
||||
(org-set-startup-visibility)))
|
||||
((equal content-type "markdown") (progn (markdown-mode)
|
||||
(visual-line-mode)))
|
||||
((equal content-type "ledger") (beancount-mode))
|
||||
@@ -313,6 +654,163 @@ Use `which-key` if available, else display simple message in echo area"
|
||||
(t (fundamental-mode))))
|
||||
(read-only-mode t)))
|
||||
|
||||
|
||||
;; ----------------
|
||||
;; Khoj Chat
|
||||
;; ----------------
|
||||
|
||||
(defun khoj--chat ()
  "Open the Khoj chat buffer and send a message read from the minibuffer.
Conversation history is loaded first when the chat buffer does not exist yet.
An empty message is not sent."
  (interactive)
  ;; populate the chat buffer with the past conversation on first use
  (unless (get-buffer khoj--chat-buffer-name)
    (khoj--load-chat-history khoj--chat-buffer-name))
  (switch-to-buffer khoj--chat-buffer-name)
  (let ((user-message (read-string "Query: ")))
    (unless (string-empty-p user-message)
      (khoj--query-chat-api-and-render-messages user-message khoj--chat-buffer-name))))
|
||||
|
||||
(defun khoj--load-chat-history (buffer-name)
  "Load Khoj Chat conversation history into BUFFER-NAME.
Fetch past messages by sending an empty query to the Khoj Chat API, render
them below a top-level heading, then set the buffer up as a read-only
`org-mode' buffer with chat keybindings.  When the API call does not yield
a parsed response holding a list of messages, only the heading is inserted."
  (let* ((api-response (khoj--query-chat-api ""))
         ;; a failed request may hand back a non-alist value, on which
         ;; `assoc' would signal a type error
         (chat-log (and (listp api-response)
                        (cdr (assoc 'response api-response)))))
    (with-current-buffer (get-buffer-create buffer-name)
      (erase-buffer)
      (insert "* Khoj Chat\n")
      ;; only iterate when the chat log really is a JSON array (or empty)
      (when (or (vectorp chat-log) (listp chat-log))
        (thread-last
          chat-log
          ;; generate chat messages from Khoj Chat API response
          (mapcar #'khoj--render-chat-response)
          ;; insert chat messages into Khoj Chat Buffer
          (mapc #'insert)))
      (org-mode)
      (khoj--add-hover-text-to-footnote-refs (point-min))

      ;; render reference footnotes as superscript
      (setq-local
       org-startup-folded "showall"
       org-hide-leading-stars t
       org-use-sub-superscripts '{}
       org-pretty-entities-include-sub-superscripts t
       org-pretty-entities t)
      (org-set-startup-visibility)

      ;; create khoj chat shortcut keybindings
      (use-local-map (copy-keymap org-mode-map))
      (local-set-key (kbd "m") #'khoj--chat)
      (local-set-key (kbd "C-x m") #'khoj--chat)

      ;; enable minor modes for khoj chat
      (visual-line-mode)
      (read-only-mode t))))
|
||||
|
||||
(defun khoj--add-hover-text-to-footnote-refs (start-pos)
  "Show footnote defs on mouse hover on footnote refs from START-POS.
Scan the current buffer from START-POS for matches of `org-footnote-re'.
For each match that is a labelled footnote reference with a definition,
attach an overlay whose `help-echo' is that definition, truncated to 70
characters with a \"...\" suffix when longer.  Other matches are skipped."
  (org-with-wide-buffer
   (goto-char start-pos)
   (while (re-search-forward org-footnote-re nil t)
     ;; step back one character before asking for the element at point;
     ;; presumably so point is inside the footnote rather than just after it
     (backward-char)
     (let* ((context (org-element-context))
            (label (and (eq (org-element-type context) 'footnote-reference)
                        (org-element-property :label context)))
            (footnote-def (and label
                               (nth 3 (org-footnote-get-definition label)))))
       ;; create the overlay only once there is text to show, so matches
       ;; without a usable definition leave no stray overlays behind
       (when (stringp footnote-def)
         (let ((hover-text (if (< (length footnote-def) 70)
                               footnote-def
                             ;; truncate and mark long definitions
                             (concat (substring footnote-def 0 70) "...")))
               (overlay (make-overlay (org-element-property :begin context)
                                      (org-element-property :end context))))
           ;; show definition on hover on footnote reference
           (overlay-put overlay 'help-echo hover-text)))))))
|
||||
|
||||
(defun khoj--query-chat-api-and-render-messages (query buffer-name)
  "Send QUERY to Khoj Chat. Render the chat messages from exchange in BUFFER-NAME.
The user's QUERY and the response from the Khoj Chat API are appended at
the end of BUFFER-NAME, and hover text is added to the footnote references
in the newly inserted text."
  ;; render json response into formatted chat messages
  (with-current-buffer (get-buffer buffer-name)
    (let ((inhibit-read-only t)
          (new-content-start-pos (point-max))
          (query-time (format-time-string "%F %T"))
          (json-response (khoj--query-chat-api query)))
      (goto-char new-content-start-pos)
      (insert
       (khoj--render-chat-message query "you" query-time)
       (khoj--render-chat-response json-response))
      (khoj--add-hover-text-to-footnote-refs new-content-start-pos))
    (org-set-startup-visibility)
    (visual-line-mode)
    ;; search back for the closest heading carrying the Khoj emoji.
    ;; The star is quoted explicitly (\\*) instead of relying on Emacs
    ;; treating a `*' with nothing to repeat as a literal character.
    (re-search-backward "^\\*+ 🦅" nil t)))
|
||||
|
||||
(defun khoj--query-chat-api (query)
  "Send QUERY to Khoj Chat API.
Return the JSON response parsed with objects as alists.  If the request
fails with a `file-error', report it in the echo area and return nil."
  (let* ((url-request-method "GET")
         (encoded-query (url-hexify-string query))
         (query-url (format "%s/api/chat?q=%s" khoj-server-url encoded-query)))
    (with-temp-buffer
      (condition-case ex
          (progn
            (url-insert-file-contents query-url)
            (json-parse-buffer :object-type 'alist))
        ;; handler condition names are not evaluated, so they are not quoted
        (file-error
         (let ((details (nth 2 ex)))
           ;; the error data is not guaranteed to hold a string here
           (if (and (stringp details)
                    (string-match-p "Internal server error" details))
               (message "Chat processor not configured. Configure OpenAI API key and restart it. Exception: [%s]" ex)
             (message "Chat exception: [%s]" ex)))
         ;; return nil rather than the message text, so callers that look
         ;; up keys in the response get no match instead of a type error
         nil)))))
|
||||
|
||||
|
||||
(defun khoj--render-chat-message (message sender &optional receive-date)
  "Render chat message as an `org-mode' heading with a properties drawer.
MESSAGE is the text of the chat message.  Its first non-empty line becomes
the heading text; the remaining non-empty lines form the body.  A nil or
empty MESSAGE renders an empty heading text.
SENDER is the message sender.  \"you\" is rendered as the user; any other
value is rendered as Khoj.
RECEIVE-DATE is the message receive date.  It defaults to the current time."
  (let* (;; split once; empty lines are dropped
         (message-lines (split-string (or message "") "\n" t))
         ;; fall back to \"\" so an empty message is not printed as \"nil\"
         (first-message-line (or (car message-lines) ""))
         (rest-message-lines (string-join (cdr message-lines) "\n"))
         (from-user (equal sender "you"))
         (heading-level (if from-user "**" "***"))
         (emojified-sender (if from-user "🤔 *You*" "🦅 *Khoj*"))
         ;; only messages explicitly sent by \"khoj\" get trailing blank lines
         (suffix-newlines (if (equal sender "khoj") "\n\n" ""))
         (received (or receive-date (format-time-string "%F %T"))))
    (format "%s %s: %s\n :PROPERTIES:\n :RECEIVED: [%s]\n :END:\n%s\n%s"
            heading-level
            emojified-sender
            first-message-line
            received
            rest-message-lines
            suffix-newlines)))
|
||||
|
||||
(defun khoj--generate-reference (reference)
  "Create `org-mode' footnotes with REFERENCE.
Increment `khoj--reference-count' and return a cons cell.  Its car is the
superscript footnote link, carrying REFERENCE as hover text.  Its cdr is
the matching footnote definition, with every pair of consecutive newlines
in REFERENCE replaced by a single newline."
  (setq khoj--reference-count (1+ khoj--reference-count))
  (let ((footnote-link
         (propertize (format "^{ [fn:%x]}" khoj--reference-count)
                     'help-echo reference))
        (footnote-def
         (format "\n[fn:%x] %s"
                 khoj--reference-count
                 (replace-regexp-in-string "\n\n" "\n" reference))))
    (cons footnote-link footnote-def)))
|
||||
|
||||
(defun khoj--render-chat-response (json-response)
  "Render chat message using JSON-RESPONSE from Khoj Chat API.
The message text is read from the `response' entry of JSON-RESPONSE, or
from its `message' entry when there is no `response' entry.  Each item of
the `context' entry becomes a footnote: its link is appended to the message
and its definition is listed under a folded References sub-heading."
  (let* ((message-entry (or (assoc 'response json-response)
                            (assoc 'message json-response)))
         (sender (cdr (assoc 'by json-response)))
         (receive-date (cdr (assoc 'created json-response)))
         (references (cdr (assoc 'context json-response)))
         (footnotes (mapcar #'khoj--generate-reference references))
         (footnote-links (mapcar #'car footnotes))
         (footnote-defs (mapcar #'cdr footnotes))
         ;; reference sub-section heading, folded by default
         (references-heading
          (if footnote-defs
              "\n**** References\n:PROPERTIES:\n:VISIBILITY: folded\n:END:"
            ""))
         ;; khoj message, then reference links, then reference definitions
         (message-with-references
          (concat (cdr message-entry)
                  (string-join footnote-links "")
                  references-heading
                  (string-join footnote-defs " "))))
    ;; Render chat message using data obtained from API
    (khoj--render-chat-message message-with-references sender receive-date)))
|
||||
|
||||
|
||||
;; ------------------
|
||||
;; Incremental Search
|
||||
@@ -321,9 +819,9 @@ Use `which-key` if available, else display simple message in echo area"
|
||||
(defun khoj--incremental-search (&optional rerank)
|
||||
"Perform Incremental Search on Khoj. Allow optional RERANK of results."
|
||||
(let* ((rerank-str (cond (rerank "true") (t "false")))
|
||||
(khoj-buffer-name (get-buffer-create khoj--buffer-name))
|
||||
(khoj-buffer-name (get-buffer-create khoj--search-buffer-name))
|
||||
(query (minibuffer-contents-no-properties))
|
||||
(query-url (khoj--construct-api-query query khoj--content-type rerank-str)))
|
||||
(query-url (khoj--construct-search-api-query query khoj--content-type rerank-str)))
|
||||
;; Query khoj API only when user in khoj minibuffer and non-empty query
|
||||
;; Prevents querying if
|
||||
;; 1. user hasn't started typing query
|
||||
@@ -342,7 +840,7 @@ Use `which-key` if available, else display simple message in echo area"
|
||||
(when rerank
|
||||
(setq khoj--rerank t)
|
||||
(message "Khoj: Rerank Results"))
|
||||
(khoj--query-api-and-render-results
|
||||
(khoj--query-search-api-and-render-results
|
||||
query-url
|
||||
khoj--content-type
|
||||
query
|
||||
@@ -370,7 +868,7 @@ Use `which-key` if available, else display simple message in echo area"
|
||||
(defun khoj-incremental ()
|
||||
"Natural, Incremental Search for your personal notes, transactions and music."
|
||||
(interactive)
|
||||
(let* ((khoj-buffer-name (get-buffer-create khoj--buffer-name)))
|
||||
(let* ((khoj-buffer-name (get-buffer-create khoj--search-buffer-name)))
|
||||
;; switch to khoj results buffer
|
||||
(switch-to-buffer khoj-buffer-name)
|
||||
;; open and setup minibuffer for incremental search
|
||||
@@ -435,14 +933,14 @@ Paragraph only starts at first text after blank line."
|
||||
;; get paragraph, if in text mode
|
||||
(t
|
||||
(khoj--get-current-paragraph-text))))
|
||||
(query-url (khoj--construct-api-query query content-type rerank))
|
||||
(query-url (khoj--construct-search-api-query query content-type rerank))
|
||||
;; extract heading to show in result buffer from query
|
||||
(query-title
|
||||
(format "Similar to: %s"
|
||||
(replace-regexp-in-string "^[#\\*]* " "" (car (split-string query "\n")))))
|
||||
(buffer-name (get-buffer-create khoj--buffer-name)))
|
||||
(buffer-name (get-buffer-create khoj--search-buffer-name)))
|
||||
(progn
|
||||
(khoj--query-api-and-render-results
|
||||
(khoj--query-search-api-and-render-results
|
||||
query-url
|
||||
content-type
|
||||
query-title
|
||||
@@ -496,15 +994,20 @@ Paragraph only starts at first text after blank line."
|
||||
(setq khoj--content-type content-type)
|
||||
(url-retrieve update-url (lambda (_) (message "Khoj %s index %supdated!" content-type (if (member "--force-update" args) "force " "")))))))
|
||||
|
||||
(transient-define-prefix khoj-menu ()
|
||||
(transient-define-suffix khoj--chat-command (&optional _args)
  "Start a chat with Khoj from the Khoj transient menu.
The transient arguments passed in as _ARGS are not used."
  (interactive (list (transient-args transient-current-command)))
  (khoj--chat))
|
||||
|
||||
(transient-define-prefix khoj--menu ()
|
||||
"Create Khoj Menu to Configure and Execute Commands."
|
||||
[["Configure General"
|
||||
[["Configure Search"
|
||||
("n" "Results Count" "--results-count=" :init-value (lambda (obj) (oset obj value (format "%s" khoj-results-count))))
|
||||
("t" "Content Type" khoj--content-type-switch)]
|
||||
["Configure Search"
|
||||
("n" "Results Count" "--results-count=" :init-value (lambda (obj) (oset obj value (format "%s" khoj-results-count))))]
|
||||
["Configure Update"
|
||||
("-f" "Force Update" "--force-update")]]
|
||||
[["Act"
|
||||
("c" "Chat" khoj--chat-command)
|
||||
("s" "Search" khoj--search-command)
|
||||
("f" "Find Similar" khoj--find-similar-command)
|
||||
("u" "Update" khoj--update-command)
|
||||
@@ -517,9 +1020,11 @@ Paragraph only starts at first text after blank line."
|
||||
|
||||
;;;###autoload
|
||||
(defun khoj ()
|
||||
"Natural, Incremental Search for your personal notes, transactions and images."
|
||||
"Provide natural, search assistance for your notes, transactions and images."
|
||||
(interactive)
|
||||
(khoj-menu))
|
||||
(when khoj-auto-setup
|
||||
(khoj-setup t))
|
||||
(khoj--menu))
|
||||
|
||||
(provide 'khoj)
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
|
||||
;; Version: 0.0.0
|
||||
;; Package-Requires: ((emacs "27.1") (transient "0.3.0"))
|
||||
;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1") (org "9.0.0"))
|
||||
;; URL: https://github.com/debanjum/khoj/tree/master/src/interface/emacs
|
||||
|
||||
;;; License:
|
||||
@@ -28,8 +28,10 @@
|
||||
|
||||
;;; Code:
|
||||
|
||||
(require 'dash)
|
||||
(require 'ert)
|
||||
(require 'khoj)
|
||||
(require 'org)
|
||||
|
||||
|
||||
|
||||
@@ -107,8 +109,7 @@ Penance to Immortality\n\
|
||||
** Act\n\
|
||||
\n\
|
||||
Rule everything\n\
|
||||
\n\
|
||||
#+STARTUP: showall hidestars inlineimages"))))
|
||||
\n"))))
|
||||
|
||||
|
||||
(ert-deftest khoj-tests--extract-entries-as-ledger ()
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
- [Setup Plugin](#2-Setup-Plugin)
|
||||
- [Use](#Use)
|
||||
- [Search](#search)
|
||||
- [Chat](#chat)
|
||||
- [Find Similar Notes](#find-similar-notes)
|
||||
- [Upgrade](#Upgrade)
|
||||
- [Upgrade Backend](#1-Upgrade-Backend)
|
||||
@@ -21,9 +22,14 @@
|
||||
- [Implementation](#Implementation)
|
||||
|
||||
## Features
|
||||
- **Natural**: Advanced natural language understanding using Transformer based ML Models
|
||||
- **Local**: Your personal data stays local. All search, indexing is done on your machine[\*](https://github.com/debanjum/khoj#miscellaneous)
|
||||
- **Incremental**: Incremental search for a fast, search-as-you-type experience
|
||||
- **Search**
|
||||
- **Natural**: Advanced natural language understanding using Transformer based ML Models
|
||||
- **Local**: Your personal data stays local. All search and indexing is done on your machine. *Unlike chat, which requires access to GPT.*
|
||||
- **Incremental**: Incremental search for a fast, search-as-you-type experience
|
||||
- **Chat**
|
||||
- **Faster answers**: Find answers faster and with less effort than search
|
||||
- **Iterative discovery**: Iteratively explore and (re-)discover your notes
|
||||
- **Assisted creativity**: Smoothly weave between answer retrieval and content generation
|
||||
|
||||
## Demo
|
||||
https://user-images.githubusercontent.com/6413477/210486007-36ee3407-e6aa-4185-8a26-b0bfc0a4344f.mp4
|
||||
@@ -44,7 +50,7 @@ https://user-images.githubusercontent.com/6413477/210486007-36ee3407-e6aa-4185-8
|
||||

|
||||
|
||||
## Setup
|
||||
- *Make sure [python](https://realpython.com/installing-python/) and [pip](https://pip.pypa.io/en/stable/installation/) are installed on your machine*
|
||||
- *Make sure [python](https://realpython.com/installing-python/) (version 3.10 or lower) and [pip](https://pip.pypa.io/en/stable/installation/) are installed on your machine*
|
||||
- *Ensure you follow the ordering of the setup steps. Install the plugin after starting the khoj backend. This allows the plugin to configure the khoj backend*
|
||||
|
||||
### 1. Setup Backend
|
||||
@@ -55,10 +61,23 @@ pip install khoj-assistant && khoj --no-gui
|
||||
### 2. Setup Plugin
|
||||
1. Open [Khoj](https://obsidian.md/plugins?id=khoj) from the *Community plugins* tab in Obsidian settings panel
|
||||
2. Click *Install*, then *Enable* on the Khoj plugin page in Obsidian
|
||||
3. [Optional] To enable Khoj Chat, set your [OpenAI API key](https://platform.openai.com/account/api-keys) in the Khoj plugin settings
|
||||
|
||||
See [official Obsidian plugin docs](https://help.obsidian.md/Extending+Obsidian/Community+plugins) for details
|
||||
|
||||
## Use
|
||||
### Chat
|
||||
Run *Khoj: Chat* from the [Command Palette](https://help.obsidian.md/Plugins/Command+palette) and ask questions in a natural, conversational style.<br />
|
||||
E.g. "When did I file my taxes last year?"
|
||||
|
||||
Notes:
|
||||
- *Using Khoj Chat shares the notes relevant to your query with OpenAI so that ChatGPT can respond.*
|
||||
- *To use Khoj Chat, ensure you've set your [OpenAI API key](https://platform.openai.com/account/api-keys) in the Khoj plugin settings.*
|
||||
|
||||
See [Khoj Chat](https://github.com/debanjum/khoj/tree/master/#Khoj-Chat) for more details
|
||||
|
||||

|
||||
|
||||
### Search
|
||||
Click the *Khoj search* icon 🔎 on the [Ribbon](https://help.obsidian.md/User+interface/Workspace/Ribbon) or run *Khoj: Search* from the [Command Palette](https://help.obsidian.md/Plugins/Command+palette)
|
||||
|
||||
|
||||
BIN
src/interface/obsidian/docs/khoj_chat_on_obsidian_0.6.0.png
Normal file
BIN
src/interface/obsidian/docs/khoj_chat_on_obsidian_0.6.0.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 277 KiB |
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"id": "khoj",
|
||||
"name": "Khoj",
|
||||
"version": "0.3.0",
|
||||
"version": "0.6.0",
|
||||
"minAppVersion": "0.15.0",
|
||||
"description": "Natural, Incremental Search for your Second Brain 🦅",
|
||||
"author": "Debanjum Singh Solanky",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "Khoj",
|
||||
"version": "0.3.0",
|
||||
"version": "0.6.0",
|
||||
"description": "Natural, Incremental Search for your Second Brain 🦅",
|
||||
"main": "src/main.js",
|
||||
"scripts": {
|
||||
|
||||
130
src/interface/obsidian/src/chat_modal.ts
Normal file
130
src/interface/obsidian/src/chat_modal.ts
Normal file
@@ -0,0 +1,130 @@
|
||||
import { App, Modal, request, Setting } from 'obsidian';
|
||||
import { KhojSetting } from 'src/settings';
|
||||
|
||||
|
||||
/**
 * Modal that shows the conversation with Khoj Chat and lets the user send new messages.
 *
 * Conversation history is fetched from the Khoj backend when the modal opens.
 * Pressing Enter sends the text in the chat input box to the backend and
 * renders both the user message and the Khoj response in the chat body.
 */
export class KhojChatModal extends Modal {
    // Latest value of the chat input box, updated on its `change` event
    result: string;
    // Plugin settings; `khojUrl` is used to build the chat API URL
    setting: KhojSetting;

    constructor(app: App, setting: KhojSetting) {
        super(app);
        this.setting = setting;

        // Register Modal Keybindings to send user message
        this.scope.register([], 'Enter', async () => {
            // Get text in chat input element
            let input_el = <HTMLInputElement | undefined>this.contentEl.getElementsByClassName("khoj-chat-input")[0];

            // The input box is only created at the end of onOpen, after chat history has loaded
            if (!input_el) return;

            // Clear text after extracting message to send
            let user_message = input_el.value;
            input_el.value = "";

            // Get and render chat response to user message
            await this.getChatResponse(user_message);
        });
    }

    /** Build the modal: title, chat body filled with conversation history, and the chat input box. */
    async onOpen() {
        let { contentEl } = this;
        contentEl.addClass("khoj-chat");

        // Add title to the Khoj Chat modal
        contentEl.createEl("h1", ({ attr: { id: "khoj-chat-title" }, text: "Khoj Chat" }));

        // Create area for chat logs
        contentEl.createDiv({ attr: { id: "khoj-chat-body", class: "khoj-chat-body" } });

        // Get conversation history from Khoj backend
        let chatUrl = `${this.setting.khojUrl}/api/chat?`;
        let response = await request(chatUrl);
        let chatLogs = JSON.parse(response).response;

        // Only render history when the backend actually returned a list of messages
        if (Array.isArray(chatLogs)) {
            chatLogs.forEach((chatLog: any) => {
                this.renderMessageWithReferences(chatLog.message, chatLog.by, chatLog.context, new Date(chatLog.created));
            });
        }

        // Add chat input field
        contentEl.createEl("input",
            {
                attr: {
                    type: "text",
                    id: "khoj-chat-input",
                    autofocus: "autofocus",
                    placeholder: "Chat with Khoj 🦅 [Hit Enter to send message]",
                    class: "khoj-chat-input option"
                }
            })
            .addEventListener('change', (event) => { this.result = (<HTMLInputElement>event.target).value });

        // Scroll to bottom of modal, till the send message input box
        this.modalEl.scrollTop = this.modalEl.scrollHeight;
    }

    /**
     * Append a superscript reference marker to a chat message element.
     *
     * @param messageEl element of the chat message the reference belongs to
     * @param reference text of the reference, shown as the marker's tooltip
     * @param index number displayed for the reference
     * @returns the created `abbr` element
     */
    generateReference(messageEl: any, reference: string, index: number) {
        // Generate HTML for Chat Reference
        // `<sup><abbr title="${reference}" tabindex="0">[${index}] </abbr></sup>`
        // The title is passed as an element attribute, not spliced into an HTML string.
        // So quotes in the reference need no escaping; escaping them would show stray backslashes.
        return messageEl.createEl("sup").createEl("abbr", {
            attr: {
                title: reference,
                tabindex: "0",
            },
            text: `[${index}] `,
        });
    }

    /**
     * Render a chat message followed by numbered markers for its references.
     *
     * @param message text of the chat message
     * @param sender "khoj" for Khoj; any other value is displayed as the user
     * @param context references used to generate the message, if any
     * @param dt time of the message. Defaults to the current time
     */
    renderMessageWithReferences(message: string, sender: string, context?: string[], dt?: Date) {
        let messageEl = this.renderMessage(message, sender, dt);
        if (context && !!messageEl) {
            context.forEach((reference, index) => this.generateReference(messageEl, reference, index + 1));
        }
    }

    /**
     * Append a chat message to the chat body and scroll the modal to the bottom.
     *
     * @param message text of the chat message
     * @param sender "khoj" for Khoj; any other value is displayed as the user
     * @param dt time of the message. Defaults to the current time
     * @returns the element holding the message text
     */
    renderMessage(message: string, sender: string, dt?: Date): Element | null {
        let message_time = this.formatDate(dt ?? new Date());
        let emojified_sender = sender == "khoj" ? "🦅 Khoj" : "🤔 You";

        // Append message to conversation history HTML element.
        // The chat logs should display above the message input box to follow standard UI semantics
        let chat_body_el = this.contentEl.getElementsByClassName("khoj-chat-body")[0];
        let chat_message_el = chat_body_el.createDiv({
            attr: {
                "data-meta": `${emojified_sender} at ${message_time}`,
                class: `khoj-chat-message ${sender}`
            },
        }).createDiv({
            attr: {
                class: `khoj-chat-message-text ${sender}`
            },
            text: `${message}`
        })

        // Scroll to bottom after inserting chat messages
        this.modalEl.scrollTop = this.modalEl.scrollHeight;

        return chat_message_el
    }

    /** Format date in HH:MM, DD MMM YYYY format. */
    formatDate(date: Date): string {
        let time_string = date.toLocaleTimeString('en-IN', { hour: '2-digit', minute: '2-digit', hour12: false });
        let date_string = date.toLocaleString('en-IN', { year: 'numeric', month: 'short', day: '2-digit' }).replace(/-/g, ' ');
        return `${time_string}, ${date_string}`;
    }

    /**
     * Send the query to Khoj Chat and render the exchange in the chat body.
     *
     * @param query message from the user. Nothing is sent when it is empty
     */
    async getChatResponse(query: string | undefined | null): Promise<void> {
        // Exit if query is empty
        if (!query) return;

        // Render user query as chat message
        this.renderMessage(query, "you");

        // Get chat response from Khoj backend
        let encodedQuery = encodeURIComponent(query);
        let chatUrl = `${this.setting.khojUrl}/api/chat?q=${encodedQuery}`;
        let response = await request(chatUrl);
        let data = JSON.parse(response);

        // Render Khoj response as chat message, with its references when the backend returns any.
        // This matches how messages from the conversation history are rendered in onOpen
        this.renderMessageWithReferences(data.response, "khoj", data.context);
    }
}
|
||||
@@ -1,6 +1,7 @@
|
||||
import { Notice, Plugin } from 'obsidian';
|
||||
import { KhojSetting, KhojSettingTab, DEFAULT_SETTINGS } from 'src/settings'
|
||||
import { KhojModal } from 'src/modal'
|
||||
import { KhojSearchModal } from 'src/search_modal'
|
||||
import { KhojChatModal } from 'src/chat_modal'
|
||||
import { configureKhojBackend } from './utils';
|
||||
|
||||
|
||||
@@ -16,7 +17,7 @@ export default class Khoj extends Plugin {
|
||||
name: 'Search',
|
||||
checkCallback: (checking) => {
|
||||
if (!checking && this.settings.connectedToBackend)
|
||||
new KhojModal(this.app, this.settings).open();
|
||||
new KhojSearchModal(this.app, this.settings).open();
|
||||
return this.settings.connectedToBackend;
|
||||
}
|
||||
});
|
||||
@@ -27,16 +28,27 @@ export default class Khoj extends Plugin {
|
||||
name: 'Find similar notes',
|
||||
editorCheckCallback: (checking) => {
|
||||
if (!checking && this.settings.connectedToBackend)
|
||||
new KhojModal(this.app, this.settings, true).open();
|
||||
new KhojSearchModal(this.app, this.settings, true).open();
|
||||
return this.settings.connectedToBackend;
|
||||
}
|
||||
});
|
||||
|
||||
// Add chat command. It can be triggered from anywhere
|
||||
this.addCommand({
|
||||
id: 'chat',
|
||||
name: 'Chat',
|
||||
checkCallback: (checking) => {
|
||||
if (!checking && this.settings.connectedToBackend && !!this.settings.openaiApiKey)
|
||||
new KhojChatModal(this.app, this.settings).open();
|
||||
return !!this.settings.openaiApiKey;
|
||||
}
|
||||
});
|
||||
|
||||
// Create an icon in the left ribbon.
|
||||
this.addRibbonIcon('search', 'Khoj', (_: MouseEvent) => {
|
||||
// Called when the user clicks the icon.
|
||||
this.settings.connectedToBackend
|
||||
? new KhojModal(this.app, this.settings).open()
|
||||
? new KhojSearchModal(this.app, this.settings).open()
|
||||
: new Notice(`❗️Ensure Khoj backend is running and Khoj URL is pointing to it in the plugin settings`);
|
||||
});
|
||||
|
||||
@@ -59,5 +71,5 @@ export default class Khoj extends Plugin {
|
||||
await configureKhojBackend(this.app.vault, this.settings, false);
|
||||
}
|
||||
this.saveData(this.settings);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,15 +1,17 @@
|
||||
import { App, SuggestModal, request, MarkdownRenderer, Instruction, Platform } from 'obsidian';
|
||||
import { KhojSetting } from 'src/settings';
|
||||
import { createNoteAndCloseModal } from 'src/utils';
|
||||
|
||||
export interface SearchResult {
|
||||
entry: string;
|
||||
file: string;
|
||||
}
|
||||
|
||||
export class KhojModal extends SuggestModal<SearchResult> {
|
||||
export class KhojSearchModal extends SuggestModal<SearchResult> {
|
||||
setting: KhojSetting;
|
||||
rerank: boolean = false;
|
||||
find_similar_notes: boolean;
|
||||
query: string = "";
|
||||
app: App;
|
||||
|
||||
constructor(app: App, setting: KhojSetting, find_similar_notes: boolean = false) {
|
||||
@@ -31,6 +33,14 @@ export class KhojModal extends SuggestModal<SearchResult> {
|
||||
this.rerank = false
|
||||
});
|
||||
|
||||
// Register Modal Keybindings to Create New Note with Query as Title
|
||||
this.scope.register(['Shift'], 'Enter', async () => {
|
||||
if (this.query != "") createNoteAndCloseModal(this.query, this);
|
||||
});
|
||||
this.scope.register(['Ctrl', 'Shift'], 'Enter', async () => {
|
||||
if (this.query != "") createNoteAndCloseModal(this.query, this, { newLeaf: true });
|
||||
});
|
||||
|
||||
// Add Hints to Modal for available Keybindings
|
||||
const modalInstructions: Instruction[] = [
|
||||
{
|
||||
@@ -86,16 +96,31 @@ export class KhojModal extends SuggestModal<SearchResult> {
|
||||
.filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path))
|
||||
.map((result: any) => { return { entry: result.entry, file: result.additional.file } as SearchResult; });
|
||||
|
||||
this.query = query;
|
||||
return results;
|
||||
}
|
||||
|
||||
async renderSuggestion(result: SearchResult, el: HTMLElement) {
|
||||
let words_to_render = 30;
|
||||
let entry_words = result.entry.split(' ')
|
||||
let entry_snipped_indicator = entry_words.length > words_to_render ? ' **...**' : '';
|
||||
let snipped_entry = entry_words.slice(0, words_to_render).join(' ');
|
||||
// Max number of lines to render
|
||||
let lines_to_render = 8;
|
||||
|
||||
// Extract filename of result
|
||||
let os_path_separator = result.file.includes('\\') ? '\\' : '/';
|
||||
let filename = result.file.split(os_path_separator).pop();
|
||||
|
||||
// Remove YAML frontmatter when rendering string
|
||||
result.entry = result.entry.replace(/---[\n\r][\s\S]*---[\n\r]/, '');
|
||||
|
||||
// Truncate search results to lines_to_render
|
||||
let entry_snipped_indicator = result.entry.split('\n').length > lines_to_render ? ' **...**' : '';
|
||||
let snipped_entry = result.entry.split('\n').slice(0, lines_to_render).join('\n');
|
||||
|
||||
// Show filename of each search result for context
|
||||
el.createEl("div",{ cls: 'khoj-result-file' }).setText(filename ?? "");
|
||||
let result_el = el.createEl("div", { cls: 'khoj-result-entry' })
|
||||
|
||||
// @ts-ignore
|
||||
MarkdownRenderer.renderMarkdown(snipped_entry + entry_snipped_indicator, el, null, null);
|
||||
MarkdownRenderer.renderMarkdown(snipped_entry + entry_snipped_indicator, result_el, null, null);
|
||||
}
|
||||
|
||||
async onChooseSuggestion(result: SearchResult, _: MouseEvent | KeyboardEvent) {
|
||||
@@ -109,7 +134,7 @@ export class KhojModal extends SuggestModal<SearchResult> {
|
||||
.sort((a, b) => b.path.length - a.path.length)
|
||||
// The first match is the best file match across OS
|
||||
// e.g Khoj server on Linux, Obsidian vault on Android
|
||||
.find(file => result.file.endsWith(file.path))
|
||||
.find(file => result.file.replace(/\\/g, "/").endsWith(file.path))
|
||||
|
||||
// Open vault file at heading of chosen search result
|
||||
if (file_match) {
|
||||
@@ -2,6 +2,7 @@ import { App, Notice, PluginSettingTab, request, Setting } from 'obsidian';
|
||||
import Khoj from 'src/main';
|
||||
|
||||
export interface KhojSetting {
|
||||
openaiApiKey: string;
|
||||
resultsCount: number;
|
||||
khojUrl: string;
|
||||
connectedToBackend: boolean;
|
||||
@@ -13,6 +14,7 @@ export const DEFAULT_SETTINGS: KhojSetting = {
|
||||
khojUrl: 'http://localhost:8000',
|
||||
connectedToBackend: false,
|
||||
autoConfigure: true,
|
||||
openaiApiKey: '',
|
||||
}
|
||||
|
||||
export class KhojSettingTab extends PluginSettingTab {
|
||||
@@ -41,7 +43,16 @@ export class KhojSettingTab extends PluginSettingTab {
|
||||
await this.plugin.saveSettings();
|
||||
containerEl.firstElementChild?.setText(this.getBackendStatusMessage());
|
||||
}));
|
||||
new Setting(containerEl)
|
||||
new Setting(containerEl)
|
||||
.setName('OpenAI API Key')
|
||||
.setDesc('Your OpenAI API Key for Khoj Chat')
|
||||
.addText(text => text
|
||||
.setValue(`${this.plugin.settings.openaiApiKey}`)
|
||||
.onChange(async (value) => {
|
||||
this.plugin.settings.openaiApiKey = value.trim();
|
||||
await this.plugin.saveSettings();
|
||||
}));
|
||||
new Setting(containerEl)
|
||||
.setName('Results Count')
|
||||
.setDesc('The number of search results to show')
|
||||
.addSlider(slider => slider
|
||||
@@ -110,7 +121,7 @@ export class KhojSettingTab extends PluginSettingTab {
|
||||
|
||||
getBackendStatusMessage() {
|
||||
return !this.plugin.settings.connectedToBackend
|
||||
? '❗Disconnected from Khoj backend. Ensure Khoj backend is running and Khoj URL is correctly set below.'
|
||||
: '✅ Connected to Khoj backend.';
|
||||
? '❗Disconnected from Khoj backend. Ensure Khoj backend is running and Khoj URL is correctly set below.'
|
||||
: '✅ Connected to Khoj backend.';
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault } from 'obsidian';
|
||||
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal } from 'obsidian';
|
||||
import { KhojSetting } from 'src/settings'
|
||||
|
||||
export function getVaultAbsolutePath(vault: Vault): string {
|
||||
@@ -29,10 +29,11 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
|
||||
|
||||
// Set index name from the path of the current vault
|
||||
let indexName = getVaultAbsolutePath(vault).replace(/\//g, '_').replace(/ /g, '_');
|
||||
// Get default index directory from khoj backend
|
||||
let khojDefaultIndexDirectory = await request(`${khojConfigUrl}/default`)
|
||||
.then(response => JSON.parse(response))
|
||||
.then(data => { return getIndexDirectoryFromBackendConfig(data); });
|
||||
// Get default config fields from khoj backend
|
||||
let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response));
|
||||
let khojDefaultIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
|
||||
let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]);
|
||||
let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["model"];
|
||||
|
||||
// Get current config if khoj backend configured, else get default config from khoj backend
|
||||
await request(khoj_already_configured ? khojConfigUrl : `${khojConfigUrl}/default`)
|
||||
@@ -49,14 +50,7 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
|
||||
"compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
}
|
||||
// Disable khoj processors, as not required
|
||||
delete data["processor"];
|
||||
|
||||
// Save new config and refresh index on khoj backend
|
||||
updateKhojBackend(setting.khojUrl, data);
|
||||
console.log(`Khoj: Created khoj backend config:\n${JSON.stringify(data)}`)
|
||||
}
|
||||
|
||||
// Else if khoj config has no markdown content config
|
||||
else if (!data["content-type"]["markdown"]) {
|
||||
// Add markdown config to khoj content-type config
|
||||
@@ -67,28 +61,59 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
|
||||
"embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
|
||||
// Save updated config and refresh index on khoj backend
|
||||
updateKhojBackend(setting.khojUrl, data);
|
||||
console.log(`Khoj: Added markdown config to khoj backend config:\n${JSON.stringify(data["content-type"])}`)
|
||||
}
|
||||
|
||||
// Else if khoj is not configured to index markdown files in configured obsidian vault
|
||||
else if (data["content-type"]["markdown"]["input-filter"].length != 1 ||
|
||||
data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) {
|
||||
// Update markdown config in khoj content-type config
|
||||
// Set markdown config to only index markdown files in configured obsidian vault
|
||||
let khojIndexDirectory = getIndexDirectoryFromBackendConfig(data);
|
||||
let khojIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
|
||||
data["content-type"]["markdown"] = {
|
||||
"input-filter": [mdInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
// Save updated config and refresh index on khoj backend
|
||||
updateKhojBackend(setting.khojUrl, data);
|
||||
console.log(`Khoj: Updated markdown config in khoj backend config:\n${JSON.stringify(data["content-type"]["markdown"])}`)
|
||||
}
|
||||
|
||||
// If OpenAI API key not set in Khoj plugin settings
|
||||
if (!setting.openaiApiKey) {
|
||||
// Disable khoj processors, as not required
|
||||
delete data["processor"];
|
||||
}
|
||||
// Else if khoj backend not configured yet
|
||||
else if (!khoj_already_configured || !data["processor"]) {
|
||||
data["processor"] = {
|
||||
"conversation": {
|
||||
"conversation-logfile": `${khojDefaultChatDirectory}/conversation.json`,
|
||||
"model": khojDefaultChatModelName,
|
||||
"openai-api-key": setting.openaiApiKey,
|
||||
}
|
||||
}
|
||||
}
|
||||
// Else if khoj config has no conversation processor config
|
||||
else if (!data["processor"]["conversation"]) {
|
||||
data["processor"]["conversation"] = {
|
||||
"conversation-logfile": `${khojDefaultChatDirectory}/conversation.json`,
|
||||
"model": khojDefaultChatModelName,
|
||||
"openai-api-key": setting.openaiApiKey,
|
||||
}
|
||||
}
|
||||
// Else if khoj is not configured with OpenAI API key from khoj plugin settings
|
||||
else if (data["processor"]["conversation"]["openai-api-key"] !== setting.openaiApiKey) {
|
||||
data["processor"]["conversation"] = {
|
||||
"conversation-logfile": data["processor"]["conversation"]["conversation-logfile"],
|
||||
"model": data["processor"]["conversation"]["model"],
|
||||
"openai-api-key": setting.openaiApiKey,
|
||||
}
|
||||
}
|
||||
|
||||
// Save updated config and refresh index on khoj backend
|
||||
updateKhojBackend(setting.khojUrl, data);
|
||||
if (!khoj_already_configured)
|
||||
console.log(`Khoj: Created khoj backend config:\n${JSON.stringify(data)}`)
|
||||
else
|
||||
console.log(`Khoj: Updated khoj backend config:\n${JSON.stringify(data)}`)
|
||||
})
|
||||
.catch(error => {
|
||||
if (notify)
|
||||
@@ -111,6 +136,39 @@ export async function updateKhojBackend(khojUrl: string, khojConfig: Object) {
|
||||
.then(_ => request(`${khojUrl}/api/update?t=markdown`));
|
||||
}
|
||||
|
||||
function getIndexDirectoryFromBackendConfig(khojConfig: any) {
|
||||
return khojConfig["content-type"]["markdown"]["embeddings-file"].split("/").slice(0, -1).join("/");
|
||||
function getIndexDirectoryFromBackendConfig(filepath: string) {
|
||||
return filepath.split("/").slice(0, -1).join("/");
|
||||
}
|
||||
|
||||
/**
 * Open (creating it if needed) a markdown note called `name`, placed according
 * to the vault's "new file location" setting.
 *
 * @param name    Note name without the `.md` extension.
 * @param newLeaf Passed through to `openLinkText` to open the note in a new leaf.
 * @throws Re-throws any error raised while resolving the path or opening the
 *         note, after logging it to the console.
 */
export async function createNote(name: string, newLeaf = false): Promise<void> {
    try {
        let pathPrefix: string
        // @ts-ignore
        switch (app.vault.getConfig('newFileLocation')) {
            case 'current':
                // Place the note next to the currently active file, if any
                pathPrefix = (app.workspace.getActiveFile()?.parent.path ?? '') + '/'
                break
            case 'folder':
                // Use the global `app` like the other branches: `this` is undefined
                // inside a module-level function, so `this.app` would throw here.
                // @ts-ignore
                pathPrefix = app.vault.getConfig('newFileFolderPath') + '/'
                break
            default: // 'root'
                pathPrefix = ''
                break
        }
        await app.workspace.openLinkText(`${pathPrefix}${name}.md`, '', newLeaf)
    } catch (e) {
        console.error('Khoj: Could not create note.\n' + (e as any).message);
        throw e
    }
}
|
||||
|
||||
/**
 * Create a note named after `query` and close `modal` once it has been opened.
 * If note creation fails, the error message is shown in a Notice and the modal
 * is left open.
 *
 * @param query Name of the note to create.
 * @param modal Modal to close after the note is created.
 * @param opt   Optional flags; `newLeaf` is forwarded to `createNote`.
 */
export async function createNoteAndCloseModal(query: string, modal: Modal, opt?: { newLeaf: boolean }): Promise<void> {
    const openInNewLeaf = opt?.newLeaf;
    try {
        await createNote(query, openInNewLeaf);
    } catch (err) {
        // Surface the failure to the user and keep the modal open
        new Notice((err as Error).message)
        return
    }
    modal.close();
}
|
||||
|
||||
@@ -6,3 +6,171 @@ available in the app when your plugin is enabled.
|
||||
If your plugin does not need CSS, delete this file.
|
||||
|
||||
*/
|
||||
|
||||
:root {
|
||||
--khoj-chat-blue: #017eff;
|
||||
--khoj-chat-dark-grey: #475569;
|
||||
}
|
||||
|
||||
.khoj-chat {
|
||||
display: grid;
|
||||
background: var(--background-primary);
|
||||
color: var(--text-normal);
|
||||
text-align: center;
|
||||
font-family: roboto, karma, segoe ui, sans-serif;
|
||||
font-size: var(--font-ui-large);
|
||||
font-weight: 300;
|
||||
line-height: 1.5em;
|
||||
}
|
||||
.khoj-chat > * {
|
||||
padding: 10px;
|
||||
margin: 10px;
|
||||
}
|
||||
|
||||
#khoj-chat-title {
|
||||
font-weight: 200;
|
||||
color: var(--khoj-chat-blue);
|
||||
}
|
||||
|
||||
#khoj-chat-body {
|
||||
font-size: var(--font-ui-medium);
|
||||
margin: 0px;
|
||||
line-height: 20px;
|
||||
overflow-y: scroll; /* Make chat body scroll to see history */
|
||||
}
|
||||
/* add chat metadata to bottom of bubble */
|
||||
.khoj-chat-message::after {
|
||||
content: attr(data-meta);
|
||||
display: block;
|
||||
font-size: var(--font-ui-smaller);
|
||||
color: var(--text-muted);
|
||||
margin: -12px 7px 0 -5px;
|
||||
}
|
||||
/* move message by khoj to left */
|
||||
.khoj-chat-message.khoj {
|
||||
margin-left: auto;
|
||||
text-align: left;
|
||||
}
|
||||
/* move message by you to right */
|
||||
.khoj-chat-message.you {
|
||||
margin-right: auto;
|
||||
text-align: right;
|
||||
}
|
||||
/* basic style chat message text */
|
||||
.khoj-chat-message-text {
|
||||
margin: 10px;
|
||||
border-radius: 10px;
|
||||
padding: 10px;
|
||||
position: relative;
|
||||
display: inline-block;
|
||||
max-width: 80%;
|
||||
text-align: left;
|
||||
}
|
||||
/* color chat bubble by khoj blue */
|
||||
.khoj-chat-message-text.khoj {
|
||||
color: var(--text-on-accent);
|
||||
background: var(--khoj-chat-blue);
|
||||
margin-left: auto;
|
||||
white-space: pre-line;
|
||||
}
|
||||
/* add left protrusion to khoj chat bubble */
|
||||
.khoj-chat-message-text.khoj:after {
|
||||
content: '';
|
||||
position: absolute;
|
||||
bottom: -2px;
|
||||
left: -7px;
|
||||
border: 10px solid transparent;
|
||||
border-top-color: var(--khoj-chat-blue);
|
||||
border-bottom: 0;
|
||||
transform: rotate(-60deg);
|
||||
}
|
||||
/* color chat bubble by you dark grey */
|
||||
.khoj-chat-message-text.you {
|
||||
color: var(--text-on-accent);
|
||||
background: var(--khoj-chat-dark-grey);
|
||||
margin-right: auto;
|
||||
}
|
||||
/* add right protrusion to you chat bubble */
|
||||
.khoj-chat-message-text.you:after {
|
||||
content: '';
|
||||
position: absolute;
|
||||
top: 91%;
|
||||
right: -2px;
|
||||
border: 10px solid transparent;
|
||||
border-left-color: var(--khoj-chat-dark-grey);
|
||||
border-right: 0;
|
||||
margin-top: -10px;
|
||||
transform: rotate(-60deg)
|
||||
}
|
||||
|
||||
#khoj-chat-footer {
|
||||
padding: 0;
|
||||
display: grid;
|
||||
grid-template-columns: minmax(70px, 100%);
|
||||
grid-column-gap: 10px;
|
||||
grid-row-gap: 10px;
|
||||
}
|
||||
#khoj-chat-footer > * {
|
||||
padding: 15px;
|
||||
background: #f9fafc
|
||||
}
|
||||
#khoj-chat-input.option:hover {
|
||||
box-shadow: 0 0 11px var(--background-modifier-box-shadow);
|
||||
}
|
||||
#khoj-chat-input {
|
||||
font-size: var(--font-ui-medium);
|
||||
padding: 25px 20px;
|
||||
}
|
||||
|
||||
@media (pointer: coarse), (hover: none) {
|
||||
#khoj-chat-body.abbr[title] {
|
||||
position: relative;
|
||||
padding-left: 4px; /* space references out to ease tapping */
|
||||
}
|
||||
#khoj-chat-body.abbr[title]:focus:after {
|
||||
content: attr(title);
|
||||
|
||||
/* position tooltip */
|
||||
position: absolute;
|
||||
left: 16px; /* open tooltip to right of ref link, instead of on top of it */
|
||||
width: auto;
|
||||
z-index: 1; /* show tooltip above chat messages */
|
||||
|
||||
/* style tooltip */
|
||||
background-color: var(--background-secondary);
|
||||
color: var(--text-muted);
|
||||
border-radius: 2px;
|
||||
box-shadow: 1px 1px 4px 0 var(--background-modifier-box-shadow);
|
||||
font-size: var(--font-ui-small);
|
||||
padding: 2px 4px;
|
||||
}
|
||||
}
|
||||
|
||||
.khoj-result-file {
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.khoj-result-entry {
|
||||
color: var(--text-muted);
|
||||
margin-left: 2em;
|
||||
padding-left: 0.5em;
|
||||
line-height: normal;
|
||||
margin-top: 0.2em;
|
||||
margin-bottom: 0.2em;
|
||||
border-left-style: solid;
|
||||
border-left-color: var(--color-accent-2);
|
||||
white-space: normal;
|
||||
}
|
||||
|
||||
.khoj-result-entry > * {
|
||||
font-size: var(--font-ui-medium);
|
||||
}
|
||||
|
||||
.khoj-result-entry > p {
|
||||
margin-top: 0.2em;
|
||||
margin-bottom: 0.2em;
|
||||
}
|
||||
|
||||
.khoj-result-entry p br {
|
||||
display: none;
|
||||
}
|
||||
|
||||
@@ -2,5 +2,8 @@
|
||||
"0.2.1": "0.15.0",
|
||||
"0.2.5": "0.15.0",
|
||||
"0.2.6": "0.15.0",
|
||||
"0.3.0": "0.15.0"
|
||||
"0.3.0": "0.15.0",
|
||||
"0.4.0": "0.15.0",
|
||||
"0.5.0": "0.15.0",
|
||||
"0.6.0": "0.15.0"
|
||||
}
|
||||
|
||||
@@ -2,18 +2,22 @@
|
||||
import sys
|
||||
import logging
|
||||
import json
|
||||
from enum import Enum
|
||||
|
||||
# External Packages
|
||||
import schedule
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.conversation.gpt import summarize
|
||||
from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl
|
||||
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.utils import constants, state
|
||||
from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
|
||||
from khoj.utils import state
|
||||
from khoj.utils.helpers import LRU, resolve_absolute_path
|
||||
from khoj.utils.helpers import LRU, resolve_absolute_path, merge_dicts
|
||||
from khoj.utils.rawconfig import FullConfig, ProcessorConfig
|
||||
from khoj.search_filter.date_filter import DateFilter
|
||||
from khoj.search_filter.word_filter import WordFilter
|
||||
@@ -39,23 +43,49 @@ def configure_server(args, required=False):
|
||||
# Initialize Processor from Config
|
||||
state.processor_config = configure_processor(args.config.processor)
|
||||
|
||||
# Initialize the search model from Config
|
||||
# Initialize the search type and model from Config
|
||||
state.search_index_lock.acquire()
|
||||
state.SearchType = configure_search_types(state.config)
|
||||
state.model = configure_search(state.model, state.config, args.regenerate)
|
||||
state.search_index_lock.release()
|
||||
|
||||
|
||||
def configure_routes(app):
    """Mount the static web assets and attach the API and web client routers to `app`."""
    # Import the routers here, not at module load, so the search types are
    # already set up while configuring the server before the API modules load.
    from khoj.routers.api import api
    from khoj.routers.api_beta import api_beta
    from khoj.routers.web_client import web_client

    app.mount("/static", StaticFiles(directory=constants.web_directory), name="static")

    # Prefixed API routers first, then the un-prefixed web client router
    for router, prefix in ((api, "/api"), (api_beta, "/api/beta")):
        app.include_router(router, prefix=prefix)
    app.include_router(web_client)
|
||||
|
||||
|
||||
@schedule.repeat(schedule.every(1).hour)
def update_search_index():
    """Refresh the search models from the current config without regenerating them.

    Registered with the scheduler to run every hour. The search index lock is
    held while the models are rebuilt.
    """
    state.search_index_lock.acquire()
    try:
        state.model = configure_search(state.model, state.config, regenerate=False)
    finally:
        # Always release, so a failed update cannot leave the lock held forever
        state.search_index_lock.release()
    logger.info("📬 Search index updated via Scheduler")
|
||||
|
||||
|
||||
def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: SearchType = None):
|
||||
def configure_search_types(config: FullConfig):
    """Build the `SearchType` enum from core search types plus configured plugin types.

    Each plugin type configured under `config.content_type.plugins` becomes an
    enum member whose name and value are both the plugin type.
    """
    # Core search types, as member name -> value
    core_members = {search_type.name: search_type.value for search_type in SearchType}

    # Plugin search types, if any are configured
    plugin_members = {}
    if config.content_type.plugins:
        for plugin_type in config.content_type.plugins.keys():
            plugin_members[plugin_type] = plugin_type

    # Dynamically generate the enum from the merged core and plugin types
    all_members = merge_dicts(core_members, plugin_members)
    return Enum("SearchType", all_members)
|
||||
|
||||
|
||||
def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: state.SearchType = None):
|
||||
# Initialize Org Notes Search
|
||||
if (t == SearchType.Org or t == None) and config.content_type.org:
|
||||
if (t == state.SearchType.Org or t == None) and config.content_type.org:
|
||||
logger.info("🦄 Setting up search for orgmode notes")
|
||||
# Extract Entries, Generate Notes Embeddings
|
||||
model.orgmode_search = text_search.setup(
|
||||
OrgToJsonl,
|
||||
@@ -66,7 +96,8 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
||||
)
|
||||
|
||||
# Initialize Org Music Search
|
||||
if (t == SearchType.Music or t == None) and config.content_type.music:
|
||||
if (t == state.SearchType.Music or t == None) and config.content_type.music:
|
||||
logger.info("🎺 Setting up search for org-music")
|
||||
# Extract Entries, Generate Music Embeddings
|
||||
model.music_search = text_search.setup(
|
||||
OrgToJsonl,
|
||||
@@ -77,7 +108,8 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
||||
)
|
||||
|
||||
# Initialize Markdown Search
|
||||
if (t == SearchType.Markdown or t == None) and config.content_type.markdown:
|
||||
if (t == state.SearchType.Markdown or t == None) and config.content_type.markdown:
|
||||
logger.info("💎 Setting up search for markdown notes")
|
||||
# Extract Entries, Generate Markdown Embeddings
|
||||
model.markdown_search = text_search.setup(
|
||||
MarkdownToJsonl,
|
||||
@@ -88,7 +120,8 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
||||
)
|
||||
|
||||
# Initialize Ledger Search
|
||||
if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
|
||||
if (t == state.SearchType.Ledger or t == None) and config.content_type.ledger:
|
||||
logger.info("💸 Setting up search for ledger")
|
||||
# Extract Entries, Generate Ledger Embeddings
|
||||
model.ledger_search = text_search.setup(
|
||||
BeancountToJsonl,
|
||||
@@ -99,12 +132,26 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
||||
)
|
||||
|
||||
# Initialize Image Search
|
||||
if (t == SearchType.Image or t == None) and config.content_type.image:
|
||||
if (t == state.SearchType.Image or t == None) and config.content_type.image:
|
||||
logger.info("🌄 Setting up search for images")
|
||||
# Extract Entries, Generate Image Embeddings
|
||||
model.image_search = image_search.setup(
|
||||
config.content_type.image, search_config=config.search_type.image, regenerate=regenerate
|
||||
)
|
||||
|
||||
# Initialize External Plugin Search
|
||||
if (t == None or t in state.SearchType) and config.content_type.plugins:
|
||||
logger.info("🔌 Setting up search for plugins")
|
||||
model.plugin_search = {}
|
||||
for plugin_type, plugin_config in config.content_type.plugins.items():
|
||||
model.plugin_search[plugin_type] = text_search.setup(
|
||||
JsonlToJsonl,
|
||||
plugin_config,
|
||||
search_config=config.search_type.asymmetric,
|
||||
regenerate=regenerate,
|
||||
filters=[DateFilter(), WordFilter(), FileFilter()],
|
||||
)
|
||||
|
||||
# Invalidate Query Cache
|
||||
state.query_cache = LRU()
|
||||
|
||||
@@ -119,6 +166,7 @@ def configure_processor(processor_config: ProcessorConfig):
|
||||
|
||||
# Initialize Conversation Processor
|
||||
if processor_config.conversation:
|
||||
logger.info("💬 Setting up conversation processor")
|
||||
processor.conversation = configure_conversation_processor(processor_config.conversation)
|
||||
|
||||
return processor
|
||||
@@ -132,10 +180,46 @@ def configure_conversation_processor(conversation_processor_config):
|
||||
# Load Metadata Logs from Conversation Logfile
|
||||
with conversation_logfile.open("r") as f:
|
||||
conversation_processor.meta_log = json.load(f)
|
||||
logger.info("Conversation logs loaded from disk.")
|
||||
logger.debug(f"Loaded conversation logs from {conversation_logfile}")
|
||||
else:
|
||||
# Initialize Conversation Logs
|
||||
conversation_processor.meta_log = {}
|
||||
conversation_processor.chat_session = ""
|
||||
|
||||
return conversation_processor
|
||||
|
||||
|
||||
@schedule.repeat(schedule.every(15).minutes)
def save_chat_session():
    """Summarize the current chat session and write the conversation log to disk.

    Registered with the scheduler to run every 15 minutes. Does nothing unless a
    conversation processor exists with a non-empty metadata log and a non-empty
    chat session. On success the in-memory chat session is reset to None.
    """
    # No need to create an empty log file
    processor_config = state.processor_config
    conversation = processor_config.conversation if processor_config else None
    if not (conversation and conversation.meta_log and conversation.chat_session):
        return

    # Summarize conversation logs for this session
    conversation_log = conversation.meta_log
    summary = summarize(
        conversation.chat_session,
        summary_type="chat",
        model=conversation.model,
        api_key=conversation.openai_api_key,
    )
    # The new session starts where the last recorded session ended (0 if none yet)
    previous_sessions = conversation_log.get("session", [{"session-end": 0}])
    session = {
        "summary": summary,
        "session-start": previous_sessions[-1]["session-end"],
        "session-end": len(conversation_log["chat"]),
    }
    conversation_log.setdefault("session", []).append(session)

    # Save conversation metadata logs to disk
    conversation_logfile = resolve_absolute_path(state.processor_config.conversation.conversation_logfile)
    conversation_logfile.parent.mkdir(parents=True, exist_ok=True)  # create conversation directory if missing
    with open(conversation_logfile, "w+", encoding="utf-8") as logfile:
        json.dump(conversation_log, logfile, indent=2)

    state.processor_config.conversation.chat_session = None
    logger.info("📩 Saved current chat session to conversation logs")
|
||||
|
||||
@@ -58,13 +58,18 @@ class MainWindow(QtWidgets.QMainWindow):
|
||||
# Add Settings Panels for each Search Type to Configure Window Layout
|
||||
self.search_settings_panels = []
|
||||
for search_type in SearchType:
|
||||
current_content_config = self.current_config["content-type"].get(search_type, {})
|
||||
current_content_config = self.current_config["content-type"].get(
|
||||
search_type, None
|
||||
) or self.get_default_config(search_type=search_type)
|
||||
self.search_settings_panels += [self.add_settings_panel(current_content_config, search_type)]
|
||||
|
||||
# Add Conversation Processor Panel to Configure Screen
|
||||
self.processor_settings_panels = []
|
||||
conversation_type = ProcessorType.Conversation
|
||||
current_conversation_config = self.current_config["processor"].get(conversation_type, {})
|
||||
if self.current_config["processor"] and conversation_type in self.current_config["processor"]:
|
||||
current_conversation_config = self.current_config["processor"][conversation_type]
|
||||
else:
|
||||
current_conversation_config = self.get_default_config(processor_type=conversation_type)
|
||||
self.processor_settings_panels += [self.add_processor_panel(current_conversation_config, conversation_type)]
|
||||
|
||||
# Add Action Buttons Panel
|
||||
|
||||
@@ -6,15 +6,9 @@
|
||||
|
||||
<link rel="icon" href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 144 144%22><text y=%22.86em%22 font-size=%22144%22>🦅</text></svg>">
|
||||
<link rel="icon" type="image/png" sizes="144x144" href="/static/assets/icons/favicon-144x144.png">
|
||||
<link rel="manifest" href="/static/khoj.webmanifest">
|
||||
<link rel="manifest" href="/static/khoj_chat.webmanifest">
|
||||
</head>
|
||||
<script>
|
||||
function setTypeFieldInUrl(type) {
|
||||
let url = new URL(window.location.href);
|
||||
url.searchParams.set("t", type.value);
|
||||
window.history.pushState({}, "", url.href);
|
||||
}
|
||||
|
||||
function formatDate(date) {
|
||||
// Format date in HH:MM, DD MMM YYYY format
|
||||
let time_string = date.toLocaleTimeString('en-IN', { hour: '2-digit', minute: '2-digit', hour12: false });
|
||||
@@ -22,6 +16,12 @@
|
||||
return `${time_string}, ${date_string}`;
|
||||
}
|
||||
|
||||
function generateReference(reference, index) {
    // Generate HTML for a chat reference: a superscript marker showing `index`,
    // with the reference text as its tooltip (title attribute).
    // HTML attribute values ignore backslash escapes, so encode the special
    // characters as entities. "&" goes first so the entities added after it
    // are not themselves re-escaped.
    let escaped_ref = reference
        .replaceAll("&", "&amp;")
        .replaceAll("\"", "&quot;")
        .replaceAll("<", "&lt;")
        .replaceAll(">", "&gt;");
    return `<sup><abbr title="${escaped_ref}" tabindex="0">${index}</abbr></sup>`;
}
|
||||
|
||||
function renderMessage(message, by, dt=null) {
|
||||
let message_time = formatDate(dt ?? new Date());
|
||||
let by_name = by == "khoj" ? "🦅 Khoj" : "🤔 You";
|
||||
@@ -31,15 +31,25 @@
|
||||
<div class="chat-message-text ${by}">${message}</div>
|
||||
</div>
|
||||
`;
|
||||
// Scroll to bottom of input-body element
|
||||
// Scroll to bottom of chat-body element
|
||||
document.getElementById("chat-body").scrollTop = document.getElementById("chat-body").scrollHeight;
|
||||
}
|
||||
|
||||
function renderMessageWithReference(message, by, context=null, dt=null) {
    // Render a chat message followed by one superscript marker per entry in
    // `context`, separated by superscript commas. With no context, only the
    // message is rendered.
    const references = context
        ? context
            .map((reference, index) => generateReference(reference, index))
            .join("<sup>,</sup>")
        : '';

    renderMessage(message+references, by, dt);
}
|
||||
|
||||
function chat() {
|
||||
// Extract required fields for search from form
|
||||
query = document.getElementById("chat-input").value.trim();
|
||||
type_ = document.getElementById("chat-type").value;
|
||||
console.log(`Query: ${query}, Type: ${type_}`);
|
||||
let query = document.getElementById("chat-input").value.trim();
|
||||
console.log(`Query: ${query}`);
|
||||
|
||||
// Short circuit on empty query
|
||||
if (query.length === 0)
|
||||
@@ -50,18 +60,15 @@
|
||||
document.getElementById("chat-input").value = "";
|
||||
|
||||
// Generate backend API URL to execute query
|
||||
url = type_ === "chat"
|
||||
? `/api/beta/chat?q=${encodeURIComponent(query)}`
|
||||
: `/api/beta/summarize?q=${encodeURIComponent(query)}`;
|
||||
let url = `/api/chat?q=${encodeURIComponent(query)}`;
|
||||
|
||||
// Call specified Khoj API
|
||||
fetch(url)
|
||||
.then(response => response.json())
|
||||
.then(data => data.response)
|
||||
.then(response => {
|
||||
.then(data => {
|
||||
// Render message by Khoj to chat body
|
||||
console.log(response);
|
||||
renderMessage(response, "khoj");
|
||||
console.log(data.response);
|
||||
renderMessageWithReference(data.response, "khoj", data.context);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -73,18 +80,13 @@
|
||||
}
|
||||
|
||||
window.onload = function () {
|
||||
// Fill type field with value passed in URL query parameters, if any.
|
||||
var type_via_url = new URLSearchParams(window.location.search).get("t");
|
||||
if (type_via_url)
|
||||
document.getElementById("chat-type").value = type_via_url;
|
||||
|
||||
fetch('/api/beta/chat')
|
||||
fetch('/api/chat')
|
||||
.then(response => response.json())
|
||||
.then(data => data.response)
|
||||
.then(chat_logs => {
|
||||
// Render conversation history, if any
|
||||
chat_logs.forEach(chat_log => {
|
||||
renderMessage(chat_log.message, chat_log.by, new Date(chat_log.created));
|
||||
renderMessageWithReference(chat_log.message, chat_log.by, chat_log.context, new Date(chat_log.created));
|
||||
});
|
||||
});
|
||||
|
||||
@@ -109,12 +111,6 @@
|
||||
<!-- Chat Footer -->
|
||||
<div id="chat-footer">
|
||||
<input type="text" id="chat-input" class="option" onkeyup=incrementalChat(event) autofocus="autofocus" placeholder="What is the meaning of life?">
|
||||
|
||||
<!--Select Chat Type from: Chat, Summarize -->
|
||||
<select id="chat-type" class="option" onchange="setTypeFieldInUrl(this)">
|
||||
<option value="chat">Chat</option>
|
||||
<option value="summarize">Summarize</option>
|
||||
</select>
|
||||
</div>
|
||||
</body>
|
||||
|
||||
@@ -183,6 +179,7 @@
|
||||
color: #f8fafc;
|
||||
background: #017eff;
|
||||
margin-left: auto;
|
||||
white-space: pre-line;
|
||||
}
|
||||
/* add left protrusion to khoj chat bubble */
|
||||
.chat-message-text.khoj:after {
|
||||
@@ -217,7 +214,7 @@
|
||||
#chat-footer {
|
||||
padding: 0;
|
||||
display: grid;
|
||||
grid-template-columns: minmax(70px, 85%) auto;
|
||||
grid-template-columns: minmax(70px, 100%);
|
||||
grid-column-gap: 10px;
|
||||
grid-row-gap: 10px;
|
||||
}
|
||||
@@ -234,6 +231,29 @@
|
||||
font-size: medium;
|
||||
}
|
||||
|
||||
@media (pointer: coarse), (hover: none) {
|
||||
abbr[title] {
|
||||
position: relative;
|
||||
padding-left: 4px; /* space references out to ease tapping */
|
||||
}
|
||||
abbr[title]:focus:after {
|
||||
content: attr(title);
|
||||
|
||||
/* position tooltip */
|
||||
position: absolute;
|
||||
left: 16px; /* open tooltip to right of ref link, instead of on top of it */
|
||||
width: auto;
|
||||
z-index: 1; /* show tooltip above chat messages */
|
||||
|
||||
/* style tooltip */
|
||||
background-color: #aaa;
|
||||
color: #f8fafc;
|
||||
border-radius: 2px;
|
||||
box-shadow: 1px 1px 4px 0 rgba(0, 0, 0, 0.4);
|
||||
font-size: 14px;
|
||||
padding: 2px 4px;
|
||||
}
|
||||
}
|
||||
@media only screen and (max-width: 600px) {
|
||||
body {
|
||||
grid-template-columns: 1fr;
|
||||
|
||||
@@ -56,7 +56,9 @@
|
||||
} else if (type === "ledger") {
|
||||
return render_ledger(query, data);
|
||||
} else {
|
||||
return `<pre id="json">${JSON.stringify(data, null, 2)}</pre>`;
|
||||
return `<div id="results-plugin">`
|
||||
+ data.map((item) => `<p>${item.entry}</p>`).join("\n")
|
||||
+ `</div>`;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -116,21 +118,21 @@
|
||||
}
|
||||
|
||||
function populate_type_dropdown() {
|
||||
// Populate type dropdown field with enabled search types only
|
||||
var possible_search_types = ["org", "markdown", "ledger", "music", "image"];
|
||||
fetch("/api/config/data")
|
||||
// Populate type dropdown field with enabled content types only
|
||||
fetch("/api/config/types")
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
.then(enabled_types => {
|
||||
document.getElementById("type").innerHTML =
|
||||
possible_search_types
|
||||
.filter(type => data["content-type"].hasOwnProperty(type) && data["content-type"][type])
|
||||
enabled_types
|
||||
.map(type => `<option value="${type}">${type.slice(0,1).toUpperCase() + type.slice(1)}</option>`)
|
||||
.join('');
|
||||
|
||||
return enabled_types;
|
||||
})
|
||||
.then(() => {
|
||||
// Set type field to search type passed in URL query parameter, if valid
|
||||
.then(enabled_types => {
|
||||
// Set type field to content type passed in URL query parameter, if valid
|
||||
var type_via_url = new URLSearchParams(window.location.search).get("t");
|
||||
if (type_via_url && possible_search_types.includes(type_via_url))
|
||||
if (type_via_url && enabled_types.includes(type_via_url))
|
||||
document.getElementById("type").value = type_via_url;
|
||||
});
|
||||
}
|
||||
@@ -154,7 +156,7 @@
|
||||
}
|
||||
|
||||
window.onload = function () {
|
||||
// Dynamically populate type dropdown based on enabled search types and type passed as URL query parameter
|
||||
// Dynamically populate type dropdown based on enabled content types and type passed as URL query parameter
|
||||
populate_type_dropdown();
|
||||
|
||||
// Set results count field with value passed in URL query parameters, if any.
|
||||
@@ -277,9 +279,10 @@
|
||||
#json {
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
#results-plugin,
|
||||
#results-ledger {
|
||||
white-space: pre-line;
|
||||
text-align: left;
|
||||
white-space: pre-line;
|
||||
}
|
||||
#results-markdown {
|
||||
text-align: left;
|
||||
|
||||
16
src/khoj/interface/web/khoj_chat.webmanifest
Normal file
16
src/khoj/interface/web/khoj_chat.webmanifest
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "Khoj Chat",
|
||||
"short_name": "Khoj Chat",
|
||||
"description": "A personal assistant for your notes",
|
||||
"icons": [
|
||||
{
|
||||
"src": "/static/assets/icons/favicon-144x144.png",
|
||||
"sizes": "144x144",
|
||||
"type": "image/png"
|
||||
}
|
||||
],
|
||||
"theme_color": "#ffffff",
|
||||
"background_color": "#ffffff",
|
||||
"display": "standalone",
|
||||
"start_url": "/chat"
|
||||
}
|
||||
@@ -14,18 +14,14 @@ warnings.filterwarnings("ignore", message=r"legacy way to download files from th
|
||||
# External Packages
|
||||
import uvicorn
|
||||
from fastapi import FastAPI
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from PyQt6 import QtWidgets
|
||||
from PyQt6.QtCore import QThread, QTimer
|
||||
from rich.logging import RichHandler
|
||||
import schedule
|
||||
|
||||
# Internal Packages
|
||||
from khoj.configure import configure_server
|
||||
from khoj.routers.api import api
|
||||
from khoj.routers.api_beta import api_beta
|
||||
from khoj.routers.web_client import web_client
|
||||
from khoj.utils import constants, state
|
||||
from khoj.configure import configure_routes, configure_server
|
||||
from khoj.utils import state
|
||||
from khoj.utils.cli import cli
|
||||
from khoj.interface.desktop.main_window import MainWindow
|
||||
from khoj.interface.desktop.system_tray import create_system_tray
|
||||
@@ -33,10 +29,6 @@ from khoj.interface.desktop.system_tray import create_system_tray
|
||||
|
||||
# Initialize the Application Server
|
||||
app = FastAPI()
|
||||
app.mount("/static", StaticFiles(directory=constants.web_directory), name="static")
|
||||
app.include_router(api, prefix="/api")
|
||||
app.include_router(api_beta, prefix="/api/beta")
|
||||
app.include_router(web_client)
|
||||
|
||||
# Setup Logger
|
||||
rich_handler = RichHandler(rich_tracebacks=True)
|
||||
@@ -60,10 +52,8 @@ def run():
|
||||
|
||||
# Set Logging Level
|
||||
if args.verbose == 0:
|
||||
logger.setLevel(logging.WARN)
|
||||
elif args.verbose == 1:
|
||||
logger.setLevel(logging.INFO)
|
||||
elif args.verbose >= 2:
|
||||
elif args.verbose >= 1:
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
# Set Log File
|
||||
@@ -71,13 +61,14 @@ def run():
|
||||
fh.setLevel(logging.DEBUG)
|
||||
logger.addHandler(fh)
|
||||
|
||||
logger.info("Starting Khoj...")
|
||||
logger.info("🌘 Starting Khoj")
|
||||
|
||||
if args.no_gui:
|
||||
# Setup task scheduler
|
||||
poll_task_scheduler()
|
||||
# Start Server
|
||||
configure_server(args, required=False)
|
||||
configure_routes(app)
|
||||
start_server(app, host=args.host, port=args.port, socket=args.socket)
|
||||
else:
|
||||
# Setup GUI
|
||||
@@ -95,6 +86,7 @@ def run():
|
||||
|
||||
# Setup Server
|
||||
configure_server(args, required=False)
|
||||
configure_routes(app)
|
||||
server = ServerThread(app, args.host, args.port, args.socket)
|
||||
|
||||
# Show Main Window on First Run Experience or if on Linux
|
||||
@@ -128,7 +120,6 @@ def run():
|
||||
|
||||
|
||||
def sigint_handler(*args):
|
||||
print("\nShutting down Khoj...")
|
||||
QtWidgets.QApplication.quit()
|
||||
|
||||
|
||||
@@ -141,10 +132,12 @@ def set_state(args):
|
||||
|
||||
|
||||
def start_server(app, host=None, port=None, socket=None):
|
||||
logger.info("🌖 Khoj is ready to use")
|
||||
if socket:
|
||||
uvicorn.run(app, proxy_headers=True, uds=socket, log_level="debug", use_colors=True, log_config=None)
|
||||
else:
|
||||
uvicorn.run(app, host=host, port=port, log_level="debug", use_colors=True, log_config=None)
|
||||
logger.info("🌒 Stopping Khoj")
|
||||
|
||||
|
||||
def poll_task_scheduler():
|
||||
|
||||
@@ -1,22 +1,56 @@
|
||||
# Standard Packages
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
# External Packages
|
||||
import openai
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils.constants import empty_escape_sequences
|
||||
from khoj.processor.conversation.utils import (
|
||||
chat_completion_with_backoff,
|
||||
completion_with_backoff,
|
||||
message_to_prompt,
|
||||
generate_chatml_messages_with_context,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def answer(text, user_query, model, api_key=None, temperature=0.5, max_tokens=500):
|
||||
"""
|
||||
Answer user query using provided text as reference with OpenAI's GPT
|
||||
"""
|
||||
# Setup Prompt based on Summary Type
|
||||
prompt = f"""
|
||||
You are a friendly, helpful personal assistant.
|
||||
Using the users notes below, answer their following question. If the answer is not contained within the notes, say "I don't know."
|
||||
|
||||
Notes:
|
||||
{text}
|
||||
|
||||
Question: {user_query}
|
||||
|
||||
Answer (in second person):"""
|
||||
# Get Response from GPT
|
||||
logger.debug(f"Prompt for GPT: {prompt}")
|
||||
response = completion_with_backoff(
|
||||
prompt=prompt,
|
||||
model=model,
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
stop='"""',
|
||||
api_key=api_key,
|
||||
)
|
||||
|
||||
# Extract, Clean Message from GPT's Response
|
||||
story = response["choices"][0]["text"]
|
||||
return str(story).replace("\n\n", "")
|
||||
|
||||
|
||||
def summarize(text, summary_type, model, user_query=None, api_key=None, temperature=0.5, max_tokens=200):
|
||||
"""
|
||||
Summarize user input using OpenAI's GPT
|
||||
"""
|
||||
# Initialize Variables
|
||||
openai.api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
|
||||
# Setup Prompt based on Summary Type
|
||||
if summary_type == "chat":
|
||||
prompt = f"""
|
||||
@@ -34,8 +68,15 @@ Summarize the below notes about {user_query}:
|
||||
Summarize the notes in second person perspective:"""
|
||||
|
||||
# Get Response from GPT
|
||||
response = openai.Completion.create(
|
||||
prompt=prompt, model=model, temperature=temperature, max_tokens=max_tokens, frequency_penalty=0.2, stop='"""'
|
||||
logger.debug(f"Prompt for GPT: {prompt}")
|
||||
response = completion_with_backoff(
|
||||
prompt=prompt,
|
||||
model=model,
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
frequency_penalty=0.2,
|
||||
stop='"""',
|
||||
api_key=api_key,
|
||||
)
|
||||
|
||||
# Extract, Clean Message from GPT's Response
|
||||
@@ -43,12 +84,114 @@ Summarize the notes in second person perspective:"""
|
||||
return str(story).replace("\n\n", "")
|
||||
|
||||
|
||||
def extract_questions(text, model="text-davinci-003", conversation_log={}, api_key=None, temperature=0, max_tokens=100):
|
||||
"""
|
||||
Infer search queries to retrieve relevant notes to answer user query
|
||||
"""
|
||||
# Extract Past User Message and Inferred Questions from Conversation Log
|
||||
chat_history = "".join(
|
||||
[
|
||||
f'Q: {chat["intent"]["query"]}\n\n{chat["intent"].get("inferred-queries") or list([chat["intent"]["query"]])}\n\n{chat["message"]}\n\n'
|
||||
for chat in conversation_log.get("chat", [])[-4:]
|
||||
if chat["by"] == "khoj"
|
||||
]
|
||||
)
|
||||
|
||||
# Get dates relative to today for prompt creation
|
||||
today = datetime.today()
|
||||
current_new_year = today.replace(month=1, day=1)
|
||||
last_new_year = current_new_year.replace(year=today.year - 1)
|
||||
|
||||
prompt = f"""
|
||||
You are Khoj, an extremely smart and helpful search assistant with the ability to retrieve information from the users notes.
|
||||
- The user will provide their questions and answers to you for context.
|
||||
- Add as much context from the previous questions and answers as required into your search queries.
|
||||
- Break messages into multiple search queries when required to retrieve the relevant information.
|
||||
- Add date filters to your search queries from questions and answers when required to retrieve the relevant information.
|
||||
|
||||
What searches, if any, will you need to perform to answer the users question?
|
||||
Provide search queries as a JSON list of strings
|
||||
Current Date: {today.strftime("%A, %Y-%m-%d")}
|
||||
|
||||
Q: How was my trip to Cambodia?
|
||||
|
||||
["How was my trip to Cambodia?"]
|
||||
|
||||
A: The trip was amazing. I went to the Angkor Wat temple and it was beautiful.
|
||||
|
||||
Q: Who did i visit that temple with?
|
||||
|
||||
["Who did I visit the Angkor Wat Temple in Cambodia with?"]
|
||||
|
||||
A: You visited the Angkor Wat Temple in Cambodia with Pablo, Namita and Xi.
|
||||
|
||||
Q: What national parks did I go to last year?
|
||||
|
||||
["National park I visited in {last_new_year.strftime("%Y")} dt>=\\"{last_new_year.strftime("%Y-%m-%d")}\\" dt<\\"{current_new_year.strftime("%Y-%m-%d")}\\""]
|
||||
|
||||
A: You visited the Grand Canyon and Yellowstone National Park in {last_new_year.strftime("%Y")}.
|
||||
|
||||
Q: How are you feeling today?
|
||||
|
||||
[]
|
||||
|
||||
A: I'm feeling a little bored. Helping you will hopefully make me feel better!
|
||||
|
||||
Q: How many tennis balls fit in the back of a 2002 Honda Civic?
|
||||
|
||||
["What is the size of a tennis ball?", "What is the trunk size of a 2002 Honda Civic?"]
|
||||
|
||||
A: 1085 tennis balls will fit in the trunk of a Honda Civic
|
||||
|
||||
Q: Is Bob older than Tom?
|
||||
|
||||
["When was Bob born?", "What is Tom's age?"]
|
||||
|
||||
A: Yes, Bob is older than Tom. As Bob was born on 1984-01-01 and Tom is 30 years old.
|
||||
|
||||
Q: What is their age difference?
|
||||
|
||||
["What is Bob's age?", "What is Tom's age?"]
|
||||
|
||||
A: Bob is {current_new_year.year - 1984 - 30} years older than Tom. As Bob is {current_new_year.year - 1984} years old and Tom is 30 years old.
|
||||
|
||||
{chat_history}
|
||||
Q: {text}
|
||||
|
||||
"""
|
||||
|
||||
# Get Response from GPT
|
||||
response = completion_with_backoff(
|
||||
prompt=prompt,
|
||||
model=model,
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
stop=["A: ", "\n"],
|
||||
api_key=api_key,
|
||||
)
|
||||
|
||||
# Extract, Clean Message from GPT's Response
|
||||
response_text = response["choices"][0]["text"]
|
||||
try:
|
||||
questions = json.loads(
|
||||
# Clean response to increase likelihood of valid JSON. E.g replace ' with " to enclose strings
|
||||
response_text.strip(empty_escape_sequences)
|
||||
.replace("['", '["')
|
||||
.replace("']", '"]')
|
||||
.replace("', '", '", "')
|
||||
)
|
||||
except json.decoder.JSONDecodeError:
|
||||
logger.warn(f"GPT returned invalid JSON. Falling back to using user message as search query.\n{response_text}")
|
||||
questions = [text]
|
||||
logger.debug(f"Extracted Questions by GPT: {questions}")
|
||||
return questions
|
||||
|
||||
|
||||
def extract_search_type(text, model, api_key=None, temperature=0.5, max_tokens=100, verbose=0):
|
||||
"""
|
||||
Extract search type from user query using OpenAI's GPT
|
||||
"""
|
||||
# Initialize Variables
|
||||
openai.api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
understand_primer = """
|
||||
Objective: Extract search type from user query and return information as JSON
|
||||
|
||||
@@ -77,143 +220,57 @@ A:{ "search-type": "notes" }"""
|
||||
print(f"Message -> Prompt: {text} -> {prompt}")
|
||||
|
||||
# Get Response from GPT
|
||||
response = openai.Completion.create(
|
||||
prompt=prompt, model=model, temperature=temperature, max_tokens=max_tokens, frequency_penalty=0.2, stop=["\n"]
|
||||
)
|
||||
|
||||
# Extract, Clean Message from GPT's Response
|
||||
story = str(response["choices"][0]["text"])
|
||||
return json.loads(story.strip(empty_escape_sequences))
|
||||
|
||||
|
||||
def understand(text, model, api_key=None, temperature=0.5, max_tokens=100, verbose=0):
|
||||
"""
|
||||
Understand user input using OpenAI's GPT
|
||||
"""
|
||||
# Initialize Variables
|
||||
openai.api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
understand_primer = """
|
||||
Objective: Extract intent and trigger emotion information as JSON from each chat message
|
||||
|
||||
Potential intent types and valid argument values are listed below:
|
||||
- intent
|
||||
- remember(memory-type, query);
|
||||
- memory-type=["companion","notes","ledger","image","music"]
|
||||
- search(search-type, query);
|
||||
- search-type=["google"]
|
||||
- generate(activity, query);
|
||||
- activity=["paint","write","chat"]
|
||||
- trigger-emotion(emotion)
|
||||
- emotion=["happy","confidence","fear","surprise","sadness","disgust","anger","shy","curiosity","calm"]
|
||||
|
||||
Some examples are given below for reference:
|
||||
Q: How are you doing?
|
||||
A: { "intent": {"type": "generate", "activity": "chat", "query": "How are you doing?"}, "trigger-emotion": "happy" }
|
||||
Q: Do you remember what I told you about my brother Antoine when we were at the beach?
|
||||
A: { "intent": {"type": "remember", "memory-type": "companion", "query": "Brother Antoine when we were at the beach"}, "trigger-emotion": "curiosity" }
|
||||
Q: what was that fantasy story you told me last time?
|
||||
A: { "intent": {"type": "remember", "memory-type": "companion", "query": "fantasy story told last time"}, "trigger-emotion": "curiosity" }
|
||||
Q: Let's make some drawings about the stars on a clear full moon night!
|
||||
A: { "intent": {"type": "generate", "activity": "paint", "query": "stars on a clear full moon night"}, "trigger-emotion: "happy" }
|
||||
Q: Do you know anything about Lebanon cuisine in the 18th century?
|
||||
A: { "intent": {"type": "search", "search-type": "google", "query": "lebanon cusine in the 18th century"}, "trigger-emotion; "confidence" }
|
||||
Q: Tell me a scary story
|
||||
A: { "intent": {"type": "generate", "activity": "write", "query": "A scary story"}, "trigger-emotion": "fear" }
|
||||
Q: What fiction book was I reading last week about AI starship?
|
||||
A: { "intent": {"type": "remember", "memory-type": "notes", "query": "fiction book about AI starship last week"}, "trigger-emotion": "curiosity" }
|
||||
Q: How much did I spend at Subway for dinner last time?
|
||||
A: { "intent": {"type": "remember", "memory-type": "ledger", "query": "last Subway dinner"}, "trigger-emotion": "calm" }
|
||||
Q: I'm feeling sleepy
|
||||
A: { "intent": {"type": "generate", "activity": "chat", "query": "I'm feeling sleepy"}, "trigger-emotion": "calm" }
|
||||
Q: What was that popular Sri lankan song that Alex had mentioned?
|
||||
A: { "intent": {"type": "remember", "memory-type": "music", "query": "popular Sri lankan song mentioned by Alex"}, "trigger-emotion": "curiosity" }
|
||||
Q: You're pretty funny!
|
||||
A: { "intent": {"type": "generate", "activity": "chat", "query": "You're pretty funny!"}, "trigger-emotion": "shy" }
|
||||
Q: Can you recommend a movie to watch from my notes?
|
||||
A: { "intent": {"type": "remember", "memory-type": "notes", "query": "recommend movie to watch"}, "trigger-emotion": "curiosity" }
|
||||
Q: When did I go surfing last?
|
||||
A: { "intent": {"type": "remember", "memory-type": "notes", "query": "When did I go surfing last"}, "trigger-emotion": "calm" }
|
||||
Q: Can you dance for me?
|
||||
A: { "intent": {"type": "generate", "activity": "chat", "query": "Can you dance for me?"}, "trigger-emotion": "sad" }"""
|
||||
|
||||
# Setup Prompt with Understand Primer
|
||||
prompt = message_to_prompt(text, understand_primer, start_sequence="\nA:", restart_sequence="\nQ:")
|
||||
if verbose > 1:
|
||||
print(f"Message -> Prompt: {text} -> {prompt}")
|
||||
|
||||
# Get Response from GPT
|
||||
response = openai.Completion.create(
|
||||
prompt=prompt, model=model, temperature=temperature, max_tokens=max_tokens, frequency_penalty=0.2, stop=["\n"]
|
||||
)
|
||||
|
||||
# Extract, Clean Message from GPT's Response
|
||||
story = str(response["choices"][0]["text"])
|
||||
return json.loads(story.strip(empty_escape_sequences))
|
||||
|
||||
|
||||
def converse(text, model, conversation_history=None, api_key=None, temperature=0.9, max_tokens=150):
|
||||
"""
|
||||
Converse with user using OpenAI's GPT
|
||||
"""
|
||||
# Initialize Variables
|
||||
max_words = 500
|
||||
openai.api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
|
||||
conversation_primer = f"""
|
||||
The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and a very friendly companion.
|
||||
|
||||
Human: Hello, who are you?
|
||||
AI: Hi, I am an AI conversational companion created by OpenAI. How can I help you today?"""
|
||||
|
||||
# Setup Prompt with Primer or Conversation History
|
||||
prompt = message_to_prompt(text, conversation_history or conversation_primer)
|
||||
prompt = " ".join(prompt.split()[:max_words])
|
||||
|
||||
# Get Response from GPT
|
||||
response = openai.Completion.create(
|
||||
logger.debug(f"Prompt for GPT: {prompt}")
|
||||
response = completion_with_backoff(
|
||||
prompt=prompt,
|
||||
model=model,
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
presence_penalty=0.6,
|
||||
stop=["\n", "Human:", "AI:"],
|
||||
frequency_penalty=0.2,
|
||||
stop=["\n"],
|
||||
api_key=api_key,
|
||||
)
|
||||
|
||||
# Extract, Clean Message from GPT's Response
|
||||
story = str(response["choices"][0]["text"])
|
||||
return json.loads(story.strip(empty_escape_sequences))
|
||||
|
||||
|
||||
def converse(references, user_query, conversation_log={}, api_key=None, temperature=0.2):
|
||||
"""
|
||||
Converse with user using OpenAI's ChatGPT
|
||||
"""
|
||||
# Initialize Variables
|
||||
model = "gpt-3.5-turbo"
|
||||
compiled_references = "\n\n".join({f"# {item}" for item in references})
|
||||
|
||||
personality_primer = "You are Khoj, a friendly, smart and helpful personal assistant."
|
||||
conversation_primer = f"""
|
||||
Using the notes and our past conversations as context, answer the following question.
|
||||
Current Date: {datetime.now().strftime("%Y-%m-%d")}
|
||||
|
||||
Notes:
|
||||
{compiled_references}
|
||||
|
||||
Question: {user_query}"""
|
||||
|
||||
# Setup Prompt with Primer or Conversation History
|
||||
messages = generate_chatml_messages_with_context(
|
||||
conversation_primer,
|
||||
personality_primer,
|
||||
conversation_log,
|
||||
model,
|
||||
)
|
||||
|
||||
# Get Response from GPT
|
||||
logger.debug(f"Conversation Context for GPT: {messages}")
|
||||
response = chat_completion_with_backoff(
|
||||
messages=messages,
|
||||
model=model,
|
||||
temperature=temperature,
|
||||
api_key=api_key,
|
||||
)
|
||||
|
||||
# Extract, Clean Message from GPT's Response
|
||||
story = str(response["choices"][0]["message"]["content"])
|
||||
return story.strip(empty_escape_sequences)
|
||||
|
||||
|
||||
def message_to_prompt(
|
||||
user_message, conversation_history="", gpt_message=None, start_sequence="\nAI:", restart_sequence="\nHuman:"
|
||||
):
|
||||
"""Create prompt for GPT from messages and conversation history"""
|
||||
gpt_message = f" {gpt_message}" if gpt_message else ""
|
||||
|
||||
return f"{conversation_history}{restart_sequence} {user_message}{start_sequence}{gpt_message}"
|
||||
|
||||
|
||||
def message_to_log(user_message, gpt_message, user_message_metadata={}, conversation_log=[]):
|
||||
"""Create json logs from messages, metadata for conversation log"""
|
||||
default_user_message_metadata = {
|
||||
"intent": {"type": "remember", "memory-type": "notes", "query": user_message},
|
||||
"trigger-emotion": "calm",
|
||||
}
|
||||
current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
# Create json log from Human's message
|
||||
human_log = user_message_metadata or default_user_message_metadata
|
||||
human_log["message"] = user_message
|
||||
human_log["by"] = "you"
|
||||
human_log["created"] = current_dt
|
||||
|
||||
# Create json log from GPT's response
|
||||
khoj_log = {"message": gpt_message, "by": "khoj", "created": current_dt}
|
||||
|
||||
conversation_log.extend([human_log, khoj_log])
|
||||
return conversation_log
|
||||
|
||||
|
||||
def extract_summaries(metadata):
|
||||
"""Extract summaries from metadata"""
|
||||
return "".join([f'\n{session["summary"]}' for session in metadata])
|
||||
|
||||
132
src/khoj/processor/conversation/utils.py
Normal file
132
src/khoj/processor/conversation/utils.py
Normal file
@@ -0,0 +1,132 @@
|
||||
# Standard Packages
|
||||
import os
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
# External Packages
|
||||
import openai
|
||||
import tiktoken
|
||||
from tenacity import (
|
||||
before_sleep_log,
|
||||
retry,
|
||||
retry_if_exception_type,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
wait_random_exponential,
|
||||
)
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils.helpers import merge_dicts
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
max_prompt_size = {"gpt-3.5-turbo": 4096, "gpt-4": 8192}
|
||||
|
||||
|
||||
@retry(
    # Retry only on these OpenAI error types; any other exception propagates immediately
    retry=retry_if_exception_type(
        (
            openai.error.Timeout,
            openai.error.APIError,
            openai.error.APIConnectionError,
            openai.error.RateLimitError,
            openai.error.ServiceUnavailableError,
        )
    ),
    wait=wait_random_exponential(min=1, max=30),
    stop=stop_after_attempt(6),
    before_sleep=before_sleep_log(logger, logging.DEBUG),
    reraise=True,
)
def completion_with_backoff(**kwargs):
    """
    Call OpenAI's Completion API, retrying on the OpenAI errors listed in the decorator

    All keyword arguments are forwarded unchanged to `openai.Completion.create`,
    together with a 60 second request timeout. A truthy `api_key` keyword is set as
    the module level OpenAI key, otherwise the OPENAI_API_KEY environment variable
    is used. Retries stop after 6 attempts, and each retry is logged at debug level
    before sleeping.
    """
    explicit_api_key = kwargs.get("api_key")
    openai.api_key = explicit_api_key if explicit_api_key else os.getenv("OPENAI_API_KEY")
    return openai.Completion.create(request_timeout=60, **kwargs)
|
||||
|
||||
|
||||
@retry(
    # Retry only on these OpenAI error types; any other exception propagates immediately
    retry=retry_if_exception_type(
        (
            openai.error.Timeout,
            openai.error.APIError,
            openai.error.APIConnectionError,
            openai.error.RateLimitError,
            openai.error.ServiceUnavailableError,
        )
    ),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    stop=stop_after_attempt(6),
    before_sleep=before_sleep_log(logger, logging.DEBUG),
    reraise=True,
)
def chat_completion_with_backoff(**kwargs):
    """
    Call OpenAI's ChatCompletion API, retrying on the OpenAI errors listed in the decorator

    All keyword arguments are forwarded unchanged to `openai.ChatCompletion.create`,
    together with a 60 second request timeout. A truthy `api_key` keyword is set as
    the module level OpenAI key, otherwise the OPENAI_API_KEY environment variable
    is used. Retries stop after 6 attempts, and each retry is logged at debug level
    before sleeping.
    """
    explicit_api_key = kwargs.get("api_key")
    openai.api_key = explicit_api_key if explicit_api_key else os.getenv("OPENAI_API_KEY")
    return openai.ChatCompletion.create(request_timeout=60, **kwargs)
|
||||
|
||||
|
||||
def generate_chatml_messages_with_context(
    user_message, system_message, conversation_log=None, model_name="gpt-3.5-turbo", lookback_turns=2
):
    """
    Generate messages for ChatGPT with context from previous conversation

    Args:
        user_message: Text of the current user message. Always the last message returned.
        system_message: Text of the system message. It is placed just before the most
            recent past exchange, or before the user message when there is no history.
        conversation_log: Dictionary whose "chat" list holds past messages, each with a
            "message" key and an optional "context" key. Defaults to no history.
            NOTE(review): entries are assumed to alternate user, assistant - confirm with caller.
        model_name: Key into `max_prompt_size` and the model used to pick the token encoder.
        lookback_turns: Maximum number of past user/assistant exchanges to include.

    Returns:
        List of chatml message dictionaries in chronological order.

    Raises:
        KeyError: If `model_name` has no entry in `max_prompt_size`.
    """
    # Use a fresh dictionary per call instead of a shared mutable default argument
    if conversation_log is None:
        conversation_log = {}

    # Extract Chat History for Context
    chat_logs = [
        f'{chat["message"]}\n\nNotes:\n{chat.get("context","")}' for chat in conversation_log.get("chat", [])
    ]

    # Extract in reverse chronological order
    # Pair each assistant reply (last, third last, ...) with the user message just before it
    rest_backnforths = []
    for user_msg, assistant_msg in zip(chat_logs[-2::-2], chat_logs[::-2]):
        if len(rest_backnforths) >= 2 * lookback_turns:
            break
        rest_backnforths += reciprocal_conversation_to_chatml([user_msg, assistant_msg])[::-1]

    # Format user and system messages to chatml format
    system_chatml_message = [message_to_chatml(system_message, "system")]
    user_chatml_message = [message_to_chatml(user_message, "user")]

    # Newest first: current user message, latest exchange, system message, then older exchanges
    messages = user_chatml_message + rest_backnforths[:2] + system_chatml_message + rest_backnforths[2:]

    # Truncate oldest messages from conversation history until under max supported prompt size by model
    # Tokenize each message once and subtract the count of every dropped message
    encoder = tiktoken.encoding_for_model(model_name)
    message_tokens = [sum(len(encoder.encode(value)) for value in message.values()) for message in messages]
    tokens = sum(message_tokens)
    # Never drop the current user message, so the result is not empty even when it alone is too large
    while tokens > max_prompt_size[model_name] and len(messages) > 1:
        messages.pop()
        tokens -= message_tokens.pop()

    # Return message in chronological order
    return messages[::-1]
|
||||
|
||||
|
||||
def reciprocal_conversation_to_chatml(message_pair):
    """
    Convert one user, assistant exchange to a list of chatml messages

    The first item of the pair is tagged with the "user" role and the second with
    the "assistant" role. Items beyond the second are ignored.
    """
    chatml_exchange = []
    for message, role in zip(message_pair, ["user", "assistant"]):
        chatml_exchange.append(message_to_chatml(message, role))
    return chatml_exchange
|
||||
|
||||
|
||||
def message_to_chatml(message, role="assistant"):
    """
    Wrap a message in a chatml dictionary

    Returns a dictionary with the speaker under the "role" key and the message
    under the "content" key.
    """
    chatml_message = dict(role=role, content=message)
    return chatml_message
|
||||
|
||||
|
||||
def message_to_prompt(
    user_message, conversation_history="", gpt_message=None, start_sequence="\nAI:", restart_sequence="\nHuman:"
):
    """
    Build a completion prompt from the conversation history and the latest user message

    The prompt is the history, then the restart sequence and user message, then the
    start sequence. A truthy `gpt_message` is appended after the start sequence,
    separated by a single space.
    """
    reply_suffix = ""
    if gpt_message:
        reply_suffix = f" {gpt_message}"

    user_turn = f"{conversation_history}{restart_sequence} {user_message}"
    return f"{user_turn}{start_sequence}{reply_suffix}"
|
||||
|
||||
|
||||
def message_to_log(
    user_message, gpt_message, user_message_metadata=None, khoj_message_metadata=None, conversation_log=None
):
    """
    Create json logs from messages, metadata for conversation log

    Args:
        user_message: Text of the message sent by the user.
        gpt_message: Text of the response generated for the user.
        user_message_metadata: Optional dictionary merged into the user's log entry.
        khoj_message_metadata: Optional dictionary merged into the response's log entry.
        conversation_log: Optional list to append to. It is extended in place when passed.
            A new list is created for each call when omitted.

    Returns:
        The conversation log with the user's entry and the response's entry appended.
    """
    # Create fresh containers per call. Mutable default arguments are shared across
    # calls, so a default list would keep accumulating entries from unrelated calls
    if user_message_metadata is None:
        user_message_metadata = {}
    if khoj_message_metadata is None:
        khoj_message_metadata = {}
    if conversation_log is None:
        conversation_log = []

    default_khoj_message_metadata = {
        "intent": {"type": "remember", "memory-type": "notes", "query": user_message},
        "trigger-emotion": "calm",
    }
    khoj_response_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Create json log from Human's message
    # NOTE(review): merge_dicts presumably prefers values from its first argument - confirm in khoj.utils.helpers
    human_log = merge_dicts({"message": user_message, "by": "you"}, user_message_metadata)

    # Create json log from GPT's response
    khoj_log = merge_dicts(khoj_message_metadata, default_khoj_message_metadata)
    khoj_log = merge_dicts({"message": gpt_message, "by": "khoj", "created": khoj_response_time}, khoj_log)

    conversation_log.extend([human_log, khoj_log])
    return conversation_log
|
||||
|
||||
|
||||
def extract_summaries(metadata):
    """
    Collate the summary of every session in metadata into a single string

    Each session's "summary" value is prefixed with a newline. Returns an empty
    string when metadata has no sessions.
    """
    summary_lines = (f'\n{session["summary"]}' for session in metadata)
    return "".join(summary_lines)
|
||||
0
src/khoj/processor/jsonl/__init__.py
Normal file
0
src/khoj/processor/jsonl/__init__.py
Normal file
100
src/khoj/processor/jsonl/jsonl_to_jsonl.py
Normal file
100
src/khoj/processor/jsonl/jsonl_to_jsonl.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# Standard Packages
|
||||
import glob
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.helpers import get_absolute_path, timer
|
||||
from khoj.utils.jsonl import load_jsonl, dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class JsonlToJsonl(TextToJsonl):
    """Load entries from JSONL files and write them to the configured JSONL output file"""

    # Define Functions
    def process(self, previous_entries=None):
        """
        Load entries from the configured jsonl files and write them to the output file

        Args:
            previous_entries: Entries from an earlier run. When falsy, every current
                entry is numbered afresh. Otherwise the current entries are merged with
                them via `mark_entries_for_update`, keyed on the "compiled" field.

        Returns:
            List of (id, entry) pairs for the processed entries.
            NOTE(review): the pair shape of the `mark_entries_for_update` result is
            assumed from index 1 being read as the entry below - confirm in TextToJsonl.
        """
        # Extract required fields from config
        input_jsonl_files, input_jsonl_filter, output_file = (
            self.config.input_files,
            self.config.input_filter,
            self.config.compressed_jsonl,
        )

        # Get Jsonl Input Files to Process
        all_input_jsonl_files = JsonlToJsonl.get_jsonl_files(input_jsonl_files, input_jsonl_filter)

        # Extract Entries from specified jsonl files
        with timer("Parse entries from jsonl files", logger):
            input_jsons = JsonlToJsonl.extract_jsonl_entries(all_input_jsonl_files)
            current_entries = list(map(Entry.from_dict, input_jsons))

        # Split entries by max tokens supported by model
        with timer("Split entries by max token size supported by model", logger):
            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)

        # Identify, mark and merge any new entries with previous entries
        with timer("Identify new or updated entries", logger):
            if not previous_entries:
                entries_with_ids = list(enumerate(current_entries))
            else:
                entries_with_ids = self.mark_entries_for_update(
                    current_entries,
                    previous_entries,
                    key="compiled",
                    logger=logger,
                )

        with timer("Write entries to JSONL file", logger):
            # Process Each Entry from All Notes Files
            entries = [entry for _, entry in entries_with_ids]
            jsonl_data = JsonlToJsonl.convert_entries_to_jsonl(entries)

            # Compress JSONL formatted Data
            if output_file.suffix == ".gz":
                compress_jsonl_data(jsonl_data, output_file)
            elif output_file.suffix == ".jsonl":
                dump_jsonl(jsonl_data, output_file)
            else:
                # Surface the skipped write instead of silently producing no output file
                logger.warning(f"Skipped writing entries. Unsupported output file extension: {output_file}")

        return entries_with_ids

    @staticmethod
    def get_jsonl_files(jsonl_files=None, jsonl_file_filters=None):
        """
        Get all jsonl files to process

        Args:
            jsonl_files: Optional iterable of jsonl file paths to include.
            jsonl_file_filters: Optional iterable of glob patterns. Matching is recursive.

        Returns:
            Sorted list of the unique file paths from both inputs.
        """
        absolute_jsonl_files, filtered_jsonl_files = set(), set()
        if jsonl_files:
            absolute_jsonl_files = {get_absolute_path(jsonl_file) for jsonl_file in jsonl_files}
        if jsonl_file_filters:
            filtered_jsonl_files = {
                filtered_file
                for jsonl_file_filter in jsonl_file_filters
                for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True)
            }

        all_jsonl_files = sorted(absolute_jsonl_files | filtered_jsonl_files)

        # Files with unexpected extensions are still processed. They are only reported
        files_with_non_jsonl_extensions = {
            jsonl_file for jsonl_file in all_jsonl_files if not jsonl_file.endswith(".jsonl")
        }
        if files_with_non_jsonl_extensions:
            logger.warning(f"There maybe non jsonl files in the input set: {files_with_non_jsonl_extensions}")

        logger.debug(f"Processing files: {all_jsonl_files}")

        return all_jsonl_files

    @staticmethod
    def extract_jsonl_entries(jsonl_files):
        """Extract entries from specified jsonl files, keeping the order of the files"""
        entries = []
        for jsonl_file in jsonl_files:
            entries.extend(load_jsonl(Path(jsonl_file)))
        return entries

    @staticmethod
    def convert_entries_to_jsonl(entries: List[Entry]):
        """Convert each entry to JSON and collate as JSONL, one entry per line"""
        return "".join([f"{entry.to_json()}\n" for entry in entries])
|
||||
@@ -88,7 +88,7 @@ class BeancountToJsonl(TextToJsonl):
|
||||
if any(files_with_non_beancount_extensions):
|
||||
print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}")
|
||||
|
||||
logger.info(f"Processing files: {all_beancount_files}")
|
||||
logger.debug(f"Processing files: {all_beancount_files}")
|
||||
|
||||
return all_beancount_files
|
||||
|
||||
@@ -123,7 +123,7 @@ class BeancountToJsonl(TextToJsonl):
|
||||
Entry(compiled=parsed_entry, raw=parsed_entry, file=f"{transaction_to_file_map[parsed_entry]}")
|
||||
)
|
||||
|
||||
logger.info(f"Converted {len(parsed_entries)} transactions to dictionaries")
|
||||
logger.debug(f"Converted {len(parsed_entries)} transactions to dictionaries")
|
||||
|
||||
return entries
|
||||
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
# Standard Packages
|
||||
import glob
|
||||
import re
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
# Internal Packages
|
||||
@@ -92,7 +93,7 @@ class MarkdownToJsonl(TextToJsonl):
|
||||
f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}"
|
||||
)
|
||||
|
||||
logger.info(f"Processing files: {all_markdown_files}")
|
||||
logger.debug(f"Processing files: {all_markdown_files}")
|
||||
|
||||
return all_markdown_files
|
||||
|
||||
@@ -124,9 +125,12 @@ class MarkdownToJsonl(TextToJsonl):
|
||||
"Convert each Markdown entries into a dictionary"
|
||||
entries = []
|
||||
for parsed_entry in parsed_entries:
|
||||
entries.append(Entry(compiled=parsed_entry, raw=parsed_entry, file=f"{entry_to_file_map[parsed_entry]}"))
|
||||
entry_filename = Path(entry_to_file_map[parsed_entry])
|
||||
# Append base filename to compiled entry for context to model
|
||||
compiled_entry = f"{parsed_entry}\n{entry_filename.stem}"
|
||||
entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}"))
|
||||
|
||||
logger.info(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
||||
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
||||
|
||||
return entries
|
||||
|
||||
|
||||
@@ -86,7 +86,7 @@ class OrgToJsonl(TextToJsonl):
|
||||
if any(files_with_non_org_extensions):
|
||||
logger.warn(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}")
|
||||
|
||||
logger.info(f"Processing files: {all_org_files}")
|
||||
logger.debug(f"Processing files: {all_org_files}")
|
||||
|
||||
return all_org_files
|
||||
|
||||
|
||||
@@ -73,6 +73,7 @@ def makelist(filename):
|
||||
level = ""
|
||||
heading = ""
|
||||
bodytext = ""
|
||||
introtext = ""
|
||||
tags = list() # set of all tags in headline
|
||||
closed_date = ""
|
||||
sched_date = ""
|
||||
@@ -133,7 +134,7 @@ def makelist(filename):
|
||||
file_title += f" {title_text}"
|
||||
continue
|
||||
|
||||
# Ignore Properties Drawers Completely
|
||||
# Ignore Properties Drawer Start, End Lines
|
||||
if re.search(":PROPERTIES:", line):
|
||||
in_properties_drawer = True
|
||||
continue
|
||||
@@ -190,20 +191,33 @@ def makelist(filename):
|
||||
and not clocked_re
|
||||
and line[:1] != "#"
|
||||
):
|
||||
bodytext = bodytext + line
|
||||
# if we are in a heading
|
||||
if heading:
|
||||
# add the line to the bodytext
|
||||
bodytext += line
|
||||
# else we are in the pre heading portion of the file
|
||||
elif line.strip():
|
||||
# so add the line to the introtext
|
||||
introtext += line
|
||||
|
||||
# write out last node
|
||||
thisNode = Orgnode(level, heading or file_title, bodytext, tags)
|
||||
thisNode.properties = property_map
|
||||
if sched_date:
|
||||
thisNode.scheduled = sched_date
|
||||
if deadline_date:
|
||||
thisNode.deadline = deadline_date
|
||||
if closed_date:
|
||||
thisNode.closed = closed_date
|
||||
if logbook:
|
||||
thisNode.logbook = logbook
|
||||
nodelist.append(thisNode)
|
||||
# write out intro node before headings
|
||||
# this is done at the end to allow collating all title lines
|
||||
if introtext:
|
||||
thisNode = Orgnode(level, file_title, introtext, tags)
|
||||
nodelist = [thisNode] + nodelist
|
||||
# write out last heading node
|
||||
if heading:
|
||||
thisNode = Orgnode(level, heading, bodytext, tags)
|
||||
thisNode.properties = property_map
|
||||
if sched_date:
|
||||
thisNode.scheduled = sched_date
|
||||
if deadline_date:
|
||||
thisNode.deadline = deadline_date
|
||||
if closed_date:
|
||||
thisNode.closed = closed_date
|
||||
if logbook:
|
||||
thisNode.logbook = logbook
|
||||
nodelist.append(thisNode)
|
||||
|
||||
# using the list of TODO keywords found in the file
|
||||
# process the headings searching for TODO keywords
|
||||
|
||||
@@ -31,7 +31,7 @@ class TextToJsonl(ABC):
|
||||
"Split entries if compiled entry length exceeds the max tokens supported by the ML model."
|
||||
chunked_entries: List[Entry] = []
|
||||
for entry in entries:
|
||||
compiled_entry_words = entry.compiled.split()
|
||||
compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
|
||||
# Drop long words instead of having entry truncated to maintain quality of entry processed by models
|
||||
compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
|
||||
for chunk_index in range(0, len(compiled_entry_words), max_tokens):
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
# Standard Packages
|
||||
import math
|
||||
import yaml
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Union
|
||||
|
||||
# External Packages
|
||||
from fastapi import APIRouter
|
||||
@@ -9,13 +11,14 @@ from fastapi import HTTPException
|
||||
|
||||
# Internal Packages
|
||||
from khoj.configure import configure_processor, configure_search
|
||||
from khoj.processor.conversation.gpt import converse, extract_questions
|
||||
from khoj.processor.conversation.utils import message_to_log, message_to_prompt
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import FullConfig, SearchResponse
|
||||
from khoj.utils.config import SearchType
|
||||
from khoj.utils.state import SearchType
|
||||
from khoj.utils import state, constants
|
||||
|
||||
|
||||
# Initialize Router
|
||||
api = APIRouter()
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -27,6 +30,24 @@ def get_default_config_data():
|
||||
return constants.default_config
|
||||
|
||||
|
||||
@api.get("/config/types", response_model=List[str])
|
||||
def get_config_types():
|
||||
"""Get configured content types"""
|
||||
if state.config is None or state.config.content_type is None:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Content types not configured. Configure at least one content type on server and restart it.",
|
||||
)
|
||||
|
||||
configured_content_types = state.config.content_type.dict(exclude_none=True)
|
||||
return [
|
||||
search_type.value
|
||||
for search_type in SearchType
|
||||
if search_type.value in configured_content_types
|
||||
or ("plugins" in configured_content_types and search_type.name in configured_content_types["plugins"])
|
||||
]
|
||||
|
||||
|
||||
@api.get("/config/data", response_model=FullConfig)
|
||||
def get_config_data():
|
||||
return state.config
|
||||
@@ -42,26 +63,36 @@ async def set_config_data(updated_config: FullConfig):
|
||||
|
||||
|
||||
@api.get("/search", response_model=List[SearchResponse])
|
||||
def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Optional[bool] = False):
|
||||
def search(
|
||||
q: str,
|
||||
n: Optional[int] = 5,
|
||||
t: Optional[SearchType] = None,
|
||||
r: Optional[bool] = False,
|
||||
score_threshold: Optional[Union[float, None]] = None,
|
||||
dedupe: Optional[bool] = True,
|
||||
):
|
||||
results: List[SearchResponse] = []
|
||||
if q is None or q == "":
|
||||
logger.info(f"No query param (q) passed in API call to initiate search")
|
||||
logger.warn(f"No query param (q) passed in API call to initiate search")
|
||||
return results
|
||||
|
||||
# initialize variables
|
||||
user_query = q.strip()
|
||||
results_count = n
|
||||
score_threshold = score_threshold if score_threshold is not None else -math.inf
|
||||
|
||||
# return cached results, if available
|
||||
query_cache_key = f"{user_query}-{n}-{t}-{r}"
|
||||
query_cache_key = f"{user_query}-{n}-{t}-{r}-{score_threshold}-{dedupe}"
|
||||
if query_cache_key in state.query_cache:
|
||||
logger.info(f"Return response from query cache")
|
||||
logger.debug(f"Return response from query cache")
|
||||
return state.query_cache[query_cache_key]
|
||||
|
||||
if (t == SearchType.Org or t == None) and state.model.orgmode_search:
|
||||
# query org-mode notes
|
||||
with timer("Query took", logger):
|
||||
hits, entries = text_search.query(user_query, state.model.orgmode_search, rank_results=r)
|
||||
hits, entries = text_search.query(
|
||||
user_query, state.model.orgmode_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
|
||||
)
|
||||
|
||||
# collate and return results
|
||||
with timer("Collating results took", logger):
|
||||
@@ -70,7 +101,9 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
|
||||
elif (t == SearchType.Markdown or t == None) and state.model.markdown_search:
|
||||
# query markdown files
|
||||
with timer("Query took", logger):
|
||||
hits, entries = text_search.query(user_query, state.model.markdown_search, rank_results=r)
|
||||
hits, entries = text_search.query(
|
||||
user_query, state.model.markdown_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
|
||||
)
|
||||
|
||||
# collate and return results
|
||||
with timer("Collating results took", logger):
|
||||
@@ -79,7 +112,9 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
|
||||
elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
|
||||
# query transactions
|
||||
with timer("Query took", logger):
|
||||
hits, entries = text_search.query(user_query, state.model.ledger_search, rank_results=r)
|
||||
hits, entries = text_search.query(
|
||||
user_query, state.model.ledger_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
|
||||
)
|
||||
|
||||
# collate and return results
|
||||
with timer("Collating results took", logger):
|
||||
@@ -88,7 +123,9 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
|
||||
elif (t == SearchType.Music or t == None) and state.model.music_search:
|
||||
# query music library
|
||||
with timer("Query took", logger):
|
||||
hits, entries = text_search.query(user_query, state.model.music_search, rank_results=r)
|
||||
hits, entries = text_search.query(
|
||||
user_query, state.model.music_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
|
||||
)
|
||||
|
||||
# collate and return results
|
||||
with timer("Collating results took", logger):
|
||||
@@ -97,7 +134,9 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
|
||||
elif (t == SearchType.Image or t == None) and state.model.image_search:
|
||||
# query images
|
||||
with timer("Query took", logger):
|
||||
hits = image_search.query(user_query, results_count, state.model.image_search)
|
||||
hits = image_search.query(
|
||||
user_query, results_count, state.model.image_search, score_threshold=score_threshold
|
||||
)
|
||||
output_directory = constants.web_directory / "images"
|
||||
|
||||
# collate and return results
|
||||
@@ -110,6 +149,22 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
|
||||
count=results_count,
|
||||
)
|
||||
|
||||
elif (t in SearchType or t == None) and state.model.plugin_search:
|
||||
# query specified plugin type
|
||||
with timer("Query took", logger):
|
||||
hits, entries = text_search.query(
|
||||
user_query,
|
||||
# Get plugin search model for specified search type, or the first one if none specified
|
||||
state.model.plugin_search.get(t.value) or next(iter(state.model.plugin_search.values())),
|
||||
rank_results=r,
|
||||
score_threshold=score_threshold,
|
||||
dedupe=dedupe,
|
||||
)
|
||||
|
||||
# collate and return results
|
||||
with timer("Collating results took", logger):
|
||||
results = text_search.collate_results(hits, entries, results_count)
|
||||
|
||||
# Cache results
|
||||
state.query_cache[query_cache_key] = results
|
||||
|
||||
@@ -126,7 +181,7 @@ def update(t: Optional[SearchType] = None, force: Optional[bool] = False):
|
||||
logger.error(e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
else:
|
||||
logger.info("Search Index updated via API call")
|
||||
logger.info("📬 Search index updated via API")
|
||||
|
||||
try:
|
||||
state.processor_config = configure_processor(state.config.processor)
|
||||
@@ -134,6 +189,65 @@ def update(t: Optional[SearchType] = None, force: Optional[bool] = False):
|
||||
logger.error(e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
else:
|
||||
logger.info("Processor reconfigured via API call")
|
||||
logger.info("📬 Processor reconfigured via API")
|
||||
|
||||
return {"status": "ok", "message": "khoj reloaded"}
|
||||
|
||||
|
||||
@api.get("/chat")
|
||||
def chat(q: Optional[str] = None):
|
||||
if (
|
||||
state.processor_config is None
|
||||
or state.processor_config.conversation is None
|
||||
or state.processor_config.conversation.openai_api_key is None
|
||||
):
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Chat processor not configured. Configure OpenAI API key on server and restart it."
|
||||
)
|
||||
|
||||
# Initialize Variables
|
||||
api_key = state.processor_config.conversation.openai_api_key
|
||||
model = state.processor_config.conversation.model
|
||||
user_message_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
# Load Conversation History
|
||||
chat_session = state.processor_config.conversation.chat_session
|
||||
meta_log = state.processor_config.conversation.meta_log
|
||||
|
||||
# If user query is empty, return chat history
|
||||
if not q:
|
||||
if meta_log.get("chat"):
|
||||
return {"status": "ok", "response": meta_log["chat"]}
|
||||
else:
|
||||
return {"status": "ok", "response": []}
|
||||
|
||||
# Infer search queries from user message
|
||||
with timer("Extracting search queries took", logger):
|
||||
inferred_queries = extract_questions(q, model=model, api_key=api_key, conversation_log=meta_log)
|
||||
|
||||
# Collate search results as context for GPT
|
||||
with timer("Searching knowledge base took", logger):
|
||||
result_list = []
|
||||
for query in inferred_queries:
|
||||
result_list.extend(search(query, n=5, r=True, score_threshold=-5.0, dedupe=False))
|
||||
compiled_references = [item.additional["compiled"] for item in result_list]
|
||||
|
||||
try:
|
||||
with timer("Generating chat response took", logger):
|
||||
gpt_response = converse(compiled_references, q, meta_log, api_key=api_key)
|
||||
status = "ok"
|
||||
except Exception as e:
|
||||
gpt_response = str(e)
|
||||
status = "error"
|
||||
|
||||
# Update Conversation History
|
||||
state.processor_config.conversation.chat_session = message_to_prompt(q, chat_session, gpt_message=gpt_response)
|
||||
state.processor_config.conversation.meta_log["chat"] = message_to_log(
|
||||
q,
|
||||
gpt_response,
|
||||
user_message_metadata={"created": user_message_time},
|
||||
khoj_message_metadata={"context": compiled_references, "intent": {"inferred-queries": inferred_queries}},
|
||||
conversation_log=meta_log.get("chat", []),
|
||||
)
|
||||
|
||||
return {"status": status, "response": gpt_response, "context": compiled_references}
|
||||
|
||||
@@ -1,24 +1,18 @@
|
||||
# Standard Packages
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
# External Packages
|
||||
import schedule
|
||||
from fastapi import APIRouter
|
||||
|
||||
# Internal Packages
|
||||
from khoj.routers.api import search
|
||||
from khoj.processor.conversation.gpt import (
|
||||
converse,
|
||||
answer,
|
||||
extract_search_type,
|
||||
message_to_log,
|
||||
message_to_prompt,
|
||||
understand,
|
||||
summarize,
|
||||
)
|
||||
from khoj.utils.config import SearchType
|
||||
from khoj.utils.helpers import get_from_dict, resolve_absolute_path
|
||||
from khoj.utils.state import SearchType
|
||||
from khoj.utils.helpers import get_from_dict
|
||||
from khoj.utils import state
|
||||
|
||||
|
||||
@@ -48,117 +42,23 @@ def search_beta(q: str, n: Optional[int] = 1):
|
||||
return {"status": "ok", "result": search_results, "type": search_type}
|
||||
|
||||
|
||||
@api_beta.get("/summarize")
|
||||
def summarize_beta(q: str):
|
||||
@api_beta.get("/answer")
|
||||
def answer_beta(q: str):
|
||||
# Initialize Variables
|
||||
model = state.processor_config.conversation.model
|
||||
api_key = state.processor_config.conversation.openai_api_key
|
||||
|
||||
# Load Conversation History
|
||||
chat_session = state.processor_config.conversation.chat_session
|
||||
meta_log = state.processor_config.conversation.meta_log
|
||||
# Collate context for GPT
|
||||
result_list = search(q, n=2, r=True, score_threshold=0, dedupe=False)
|
||||
collated_result = "\n\n".join([f"# {item.additional['compiled']}" for item in result_list])
|
||||
logger.debug(f"Reference Context:\n{collated_result}")
|
||||
|
||||
# Converse with OpenAI GPT
|
||||
result_list = search(q, n=1, r=True)
|
||||
collated_result = "\n".join([item.entry for item in result_list])
|
||||
logger.debug(f"Semantically Similar Notes:\n{collated_result}")
|
||||
# Make GPT respond to user query using provided context
|
||||
try:
|
||||
gpt_response = summarize(collated_result, summary_type="notes", user_query=q, model=model, api_key=api_key)
|
||||
gpt_response = answer(collated_result, user_query=q, model=model, api_key=api_key)
|
||||
status = "ok"
|
||||
except Exception as e:
|
||||
gpt_response = str(e)
|
||||
status = "error"
|
||||
|
||||
# Update Conversation History
|
||||
state.processor_config.conversation.chat_session = message_to_prompt(q, chat_session, gpt_message=gpt_response)
|
||||
state.processor_config.conversation.meta_log["chat"] = message_to_log(
|
||||
q, gpt_response, conversation_log=meta_log.get("chat", [])
|
||||
)
|
||||
|
||||
return {"status": status, "response": gpt_response}
|
||||
|
||||
|
||||
@api_beta.get("/chat")
|
||||
def chat(q: Optional[str] = None):
|
||||
# Initialize Variables
|
||||
model = state.processor_config.conversation.model
|
||||
api_key = state.processor_config.conversation.openai_api_key
|
||||
|
||||
# Load Conversation History
|
||||
chat_session = state.processor_config.conversation.chat_session
|
||||
meta_log = state.processor_config.conversation.meta_log
|
||||
|
||||
# If user query is empty, return chat history
|
||||
if not q:
|
||||
if meta_log.get("chat"):
|
||||
return {"status": "ok", "response": meta_log["chat"]}
|
||||
else:
|
||||
return {"status": "ok", "response": []}
|
||||
|
||||
# Converse with OpenAI GPT
|
||||
metadata = understand(q, model=model, api_key=api_key, verbose=state.verbose)
|
||||
logger.debug(f'Understood: {get_from_dict(metadata, "intent")}')
|
||||
|
||||
if get_from_dict(metadata, "intent", "memory-type") == "notes":
|
||||
query = get_from_dict(metadata, "intent", "query")
|
||||
result_list = search(query, n=1, t=SearchType.Org, r=True)
|
||||
collated_result = "\n".join([item.entry for item in result_list])
|
||||
logger.debug(f"Semantically Similar Notes:\n{collated_result}")
|
||||
try:
|
||||
gpt_response = summarize(collated_result, summary_type="notes", user_query=q, model=model, api_key=api_key)
|
||||
status = "ok"
|
||||
except Exception as e:
|
||||
gpt_response = str(e)
|
||||
status = "error"
|
||||
else:
|
||||
try:
|
||||
gpt_response = converse(q, model, chat_session, api_key=api_key)
|
||||
status = "ok"
|
||||
except Exception as e:
|
||||
gpt_response = str(e)
|
||||
status = "error"
|
||||
|
||||
# Update Conversation History
|
||||
state.processor_config.conversation.chat_session = message_to_prompt(q, chat_session, gpt_message=gpt_response)
|
||||
state.processor_config.conversation.meta_log["chat"] = message_to_log(
|
||||
q, gpt_response, metadata, meta_log.get("chat", [])
|
||||
)
|
||||
|
||||
return {"status": status, "response": gpt_response}
|
||||
|
||||
|
||||
@schedule.repeat(schedule.every(5).minutes)
|
||||
def save_chat_session():
|
||||
# No need to create empty log file
|
||||
if not (
|
||||
state.processor_config
|
||||
and state.processor_config.conversation
|
||||
and state.processor_config.conversation.meta_log
|
||||
and state.processor_config.conversation.chat_session
|
||||
):
|
||||
return
|
||||
|
||||
# Summarize Conversation Logs for this Session
|
||||
chat_session = state.processor_config.conversation.chat_session
|
||||
openai_api_key = state.processor_config.conversation.openai_api_key
|
||||
conversation_log = state.processor_config.conversation.meta_log
|
||||
model = state.processor_config.conversation.model
|
||||
session = {
|
||||
"summary": summarize(chat_session, summary_type="chat", model=model, api_key=openai_api_key),
|
||||
"session-start": conversation_log.get("session", [{"session-end": 0}])[-1]["session-end"],
|
||||
"session-end": len(conversation_log["chat"]),
|
||||
}
|
||||
if "session" in conversation_log:
|
||||
conversation_log["session"].append(session)
|
||||
else:
|
||||
conversation_log["session"] = [session]
|
||||
logger.info("Added new chat session to conversation logs")
|
||||
|
||||
# Save Conversation Metadata Logs to Disk
|
||||
conversation_logfile = resolve_absolute_path(state.processor_config.conversation.conversation_logfile)
|
||||
conversation_logfile.parent.mkdir(parents=True, exist_ok=True) # create conversation directory if doesn't exist
|
||||
with open(conversation_logfile, "w+", encoding="utf-8") as logfile:
|
||||
json.dump(conversation_log, logfile)
|
||||
|
||||
state.processor_config.conversation.chat_session = None
|
||||
logger.info("Saved updated conversation logs to disk.")
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
# Standard Packages
|
||||
import re
|
||||
import time
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from datetime import timedelta, datetime
|
||||
@@ -41,6 +40,9 @@ class DateFilter(BaseFilter):
|
||||
date_in_entry = datetime.strptime(date_in_entry_string, "%Y-%m-%d").timestamp()
|
||||
except ValueError:
|
||||
continue
|
||||
except OSError:
|
||||
logger.debug(f"OSError: Ignoring unprocessable date in entry: {date_in_entry_string}")
|
||||
continue
|
||||
self.date_to_entry_ids[date_in_entry].add(id)
|
||||
|
||||
def can_filter(self, raw_query):
|
||||
@@ -64,7 +66,7 @@ class DateFilter(BaseFilter):
|
||||
# return results from cache if exists
|
||||
cache_key = tuple(query_daterange)
|
||||
if cache_key in self.cache:
|
||||
logger.info(f"Return date filter results from cache")
|
||||
logger.debug(f"Return date filter results from cache")
|
||||
entries_to_include = self.cache[cache_key]
|
||||
return query, entries_to_include
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# Standard Packages
|
||||
import re
|
||||
import fnmatch
|
||||
import time
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
|
||||
@@ -49,7 +48,7 @@ class FileFilter(BaseFilter):
|
||||
query = re.sub(self.file_filter_regex, "", query).strip()
|
||||
cache_key = tuple(files_to_search)
|
||||
if cache_key in self.cache:
|
||||
logger.info(f"Return file filter results from cache")
|
||||
logger.debug(f"Return file filter results from cache")
|
||||
included_entry_indices = self.cache[cache_key]
|
||||
return query, included_entry_indices
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
# Standard Packages
|
||||
import re
|
||||
import time
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
|
||||
@@ -58,7 +57,7 @@ class WordFilter(BaseFilter):
|
||||
# Return item from cache if exists
|
||||
cache_key = tuple(sorted(required_words)), tuple(sorted(blocked_words))
|
||||
if cache_key in self.cache:
|
||||
logger.info(f"Return word filter results from cache")
|
||||
logger.debug(f"Return word filter results from cache")
|
||||
included_entry_indices = self.cache[cache_key]
|
||||
return query, included_entry_indices
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
# Standard Packages
|
||||
import glob
|
||||
import math
|
||||
import pathlib
|
||||
import copy
|
||||
import shutil
|
||||
import time
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
@@ -50,9 +50,9 @@ def extract_entries(image_directories):
|
||||
image_names.extend(list(image_directory.glob("*.jpg")))
|
||||
image_names.extend(list(image_directory.glob("*.jpeg")))
|
||||
|
||||
if logger.level >= logging.INFO:
|
||||
if logger.level >= logging.DEBUG:
|
||||
image_directory_names = ", ".join([str(image_directory) for image_directory in image_directories])
|
||||
logger.info(f"Found {len(image_names)} images in {image_directory_names}")
|
||||
logger.debug(f"Found {len(image_names)} images in {image_directory_names}")
|
||||
return sorted(image_names)
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ def compute_image_embeddings(image_names, encoder, embeddings_file, batch_size=5
|
||||
# Load pre-computed image embeddings from file if exists
|
||||
if resolve_absolute_path(embeddings_file).exists() and not regenerate:
|
||||
image_embeddings = torch.load(embeddings_file)
|
||||
logger.info(f"Loaded {len(image_embeddings)} image embeddings from {embeddings_file}")
|
||||
logger.debug(f"Loaded {len(image_embeddings)} image embeddings from {embeddings_file}")
|
||||
# Else compute the image embeddings from scratch, which can take a while
|
||||
else:
|
||||
image_embeddings = []
|
||||
@@ -89,7 +89,7 @@ def compute_image_embeddings(image_names, encoder, embeddings_file, batch_size=5
|
||||
|
||||
# Save computed image embeddings to file
|
||||
torch.save(image_embeddings, embeddings_file)
|
||||
logger.info(f"Saved computed embeddings to {embeddings_file}")
|
||||
logger.info(f"📩 Saved computed image embeddings to {embeddings_file}")
|
||||
|
||||
return image_embeddings
|
||||
|
||||
@@ -102,7 +102,7 @@ def compute_metadata_embeddings(
|
||||
# Load pre-computed image metadata embedding file if exists
|
||||
if use_xmp_metadata and resolve_absolute_path(f"{embeddings_file}_metadata").exists() and not regenerate:
|
||||
image_metadata_embeddings = torch.load(f"{embeddings_file}_metadata")
|
||||
logger.info(f"Loaded pre-computed embeddings from {embeddings_file}_metadata")
|
||||
logger.debug(f"Loaded image metadata embeddings from {embeddings_file}_metadata")
|
||||
|
||||
# Else compute the image metadata embeddings from scratch, which can take a while
|
||||
if use_xmp_metadata and image_metadata_embeddings is None:
|
||||
@@ -121,7 +121,7 @@ def compute_metadata_embeddings(
|
||||
)
|
||||
continue
|
||||
torch.save(image_metadata_embeddings, f"{embeddings_file}_metadata")
|
||||
logger.info(f"Saved computed metadata embeddings to {embeddings_file}_metadata")
|
||||
logger.info(f"📩 Saved computed image metadata embeddings to {embeddings_file}_metadata")
|
||||
|
||||
return image_metadata_embeddings
|
||||
|
||||
@@ -143,18 +143,18 @@ def extract_metadata(image_name):
|
||||
return image_processed_metadata
|
||||
|
||||
|
||||
def query(raw_query, count, model: ImageSearchModel):
|
||||
def query(raw_query, count, model: ImageSearchModel, score_threshold: float = -math.inf):
|
||||
# Set query to image content if query is of form file:/path/to/file.png
|
||||
if raw_query.startswith("file:") and pathlib.Path(raw_query[5:]).is_file():
|
||||
query_imagepath = resolve_absolute_path(pathlib.Path(raw_query[5:]), strict=True)
|
||||
query = copy.deepcopy(Image.open(query_imagepath))
|
||||
query.thumbnail((640, query.height)) # scale down image for faster processing
|
||||
logger.info(f"Find Images by Image: {query_imagepath}")
|
||||
logger.info(f"🔎 Find Images by Image: {query_imagepath}")
|
||||
else:
|
||||
# Truncate words in query to stay below max_tokens supported by ML model
|
||||
max_words = 20
|
||||
query = " ".join(raw_query.split()[:max_words])
|
||||
logger.info(f"Find Images by Text: {query}")
|
||||
logger.info(f"🔎 Find Images by Text: {query}")
|
||||
|
||||
# Now we encode the query (which can either be an image or a text string)
|
||||
with timer("Query Encode Time", logger):
|
||||
@@ -199,6 +199,9 @@ def query(raw_query, count, model: ImageSearchModel):
|
||||
for corpus_id, scores in image_hits.items()
|
||||
]
|
||||
|
||||
# Filter results by score threshold
|
||||
hits = [hit for hit in hits if hit["image_score"] >= score_threshold]
|
||||
|
||||
# Sort the images based on their combined metadata, image scores
|
||||
return sorted(hits, key=lambda hit: hit["score"], reverse=True)
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Standard Packages
|
||||
import logging
|
||||
import math
|
||||
from pathlib import Path
|
||||
import time
|
||||
from typing import List, Tuple, Type
|
||||
|
||||
# External Packages
|
||||
@@ -68,7 +68,7 @@ def compute_embeddings(
|
||||
# Load pre-computed embeddings from file if exists and update them if required
|
||||
if embeddings_file.exists() and not regenerate:
|
||||
corpus_embeddings = torch.load(get_absolute_path(embeddings_file), map_location=state.device)
|
||||
logger.info(f"Loaded embeddings from {embeddings_file}")
|
||||
logger.debug(f"Loaded {len(corpus_embeddings)} text embeddings from {embeddings_file}")
|
||||
|
||||
# Encode any new entries in the corpus and update corpus embeddings
|
||||
new_entries = [entry.compiled for id, entry in entries_with_ids if id == -1]
|
||||
@@ -95,12 +95,18 @@ def compute_embeddings(
|
||||
if new_entries:
|
||||
corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
|
||||
torch.save(corpus_embeddings, embeddings_file)
|
||||
logger.info(f"Computed embeddings and saved them to {embeddings_file}")
|
||||
logger.info(f"📩 Saved computed text embeddings to {embeddings_file}")
|
||||
|
||||
return corpus_embeddings
|
||||
|
||||
|
||||
def query(raw_query: str, model: TextSearchModel, rank_results: bool = False) -> Tuple[List[dict], List[Entry]]:
|
||||
def query(
|
||||
raw_query: str,
|
||||
model: TextSearchModel,
|
||||
rank_results: bool = False,
|
||||
score_threshold: float = -math.inf,
|
||||
dedupe: bool = True,
|
||||
) -> Tuple[List[dict], List[Entry]]:
|
||||
"Search for entries that answer the query"
|
||||
query, entries, corpus_embeddings = raw_query, model.entries, model.corpus_embeddings
|
||||
|
||||
@@ -130,11 +136,15 @@ def query(raw_query: str, model: TextSearchModel, rank_results: bool = False) ->
|
||||
if rank_results:
|
||||
hits = cross_encoder_score(model.cross_encoder, query, entries, hits)
|
||||
|
||||
# Filter results by score threshold
|
||||
hits = [hit for hit in hits if hit.get("cross-score", hit.get("score")) >= score_threshold]
|
||||
|
||||
# Order results by cross-encoder score followed by bi-encoder score
|
||||
hits = sort_results(rank_results, hits)
|
||||
|
||||
# Deduplicate entries by raw entry text before showing to users
|
||||
hits = deduplicate_results(entries, hits)
|
||||
if dedupe:
|
||||
hits = deduplicate_results(entries, hits)
|
||||
|
||||
return hits, entries
|
||||
|
||||
@@ -144,7 +154,7 @@ def collate_results(hits, entries: List[Entry], count=5) -> List[SearchResponse]
|
||||
SearchResponse.parse_obj(
|
||||
{
|
||||
"entry": entries[hit["corpus_id"]].raw,
|
||||
"score": f"{hit['cross-score'] if 'cross-score' in hit else hit['score']:.3f}",
|
||||
"score": f"{hit.get('cross-score', 'score')}:.3f",
|
||||
"additional": {"file": entries[hit["corpus_id"]].file, "compiled": entries[hit["corpus_id"]].compiled},
|
||||
}
|
||||
)
|
||||
|
||||
@@ -3,7 +3,7 @@ from __future__ import annotations # to avoid quoting type hints
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, List
|
||||
from typing import TYPE_CHECKING, Dict, List
|
||||
|
||||
# External Packages
|
||||
import torch
|
||||
@@ -62,6 +62,7 @@ class SearchModels:
|
||||
music_search: TextSearchModel = None
|
||||
markdown_search: TextSearchModel = None
|
||||
image_search: ImageSearchModel = None
|
||||
plugin_search: Dict[str, TextSearchModel] = None
|
||||
|
||||
|
||||
class ConversationProcessorConfigModel:
|
||||
|
||||
@@ -56,6 +56,7 @@ default_config = {
|
||||
"processor": {
|
||||
"conversation": {
|
||||
"openai-api-key": None,
|
||||
"model": "text-davinci-003",
|
||||
"conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json",
|
||||
}
|
||||
},
|
||||
|
||||
@@ -58,16 +58,20 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
|
||||
def load_model(model_name: str, model_type, model_dir=None, device: str = None) -> Union[BaseEncoder, CrossEncoder]:
|
||||
"Load model from disk or huggingface"
|
||||
# Construct model path
|
||||
logger = logging.getLogger(__name__)
|
||||
model_path = join(model_dir, model_name.replace("/", "_")) if model_dir is not None else None
|
||||
|
||||
# Load model from model_path if it exists there
|
||||
model_type_class = get_class_by_name(model_type) if isinstance(model_type, str) else model_type
|
||||
if model_path is not None and resolve_absolute_path(model_path).exists():
|
||||
logger.debug(f"Loading {model_name} model from disk")
|
||||
model = model_type_class(get_absolute_path(model_path), device=device)
|
||||
# Else load the model from the model_name
|
||||
else:
|
||||
logger.info(f"🤖 Downloading {model_name} model from web")
|
||||
model = model_type_class(model_name, device=device)
|
||||
if model_path is not None:
|
||||
logger.info(f"📩 Saved {model_name} model to disk")
|
||||
model.save(model_path)
|
||||
|
||||
return model
|
||||
|
||||
@@ -31,7 +31,7 @@ def load_jsonl(input_path):
|
||||
jsonl_file.close()
|
||||
|
||||
# Log JSONL entries loaded
|
||||
logger.info(f"Loaded {len(data)} records from {input_path}")
|
||||
logger.debug(f"Loaded {len(data)} records from {input_path}")
|
||||
|
||||
return data
|
||||
|
||||
@@ -44,7 +44,7 @@ def dump_jsonl(jsonl_data, output_path):
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
f.write(jsonl_data)
|
||||
|
||||
logger.info(f"Wrote jsonl data to {output_path}")
|
||||
logger.debug(f"Wrote jsonl data to {output_path}")
|
||||
|
||||
|
||||
def compress_jsonl_data(jsonl_data, output_path):
|
||||
@@ -54,4 +54,4 @@ def compress_jsonl_data(jsonl_data, output_path):
|
||||
with gzip.open(output_path, "wt", encoding="utf-8") as gzip_file:
|
||||
gzip_file.write(jsonl_data)
|
||||
|
||||
logger.info(f"Wrote jsonl data to gzip compressed jsonl at {output_path}")
|
||||
logger.debug(f"Wrote jsonl data to gzip compressed jsonl at {output_path}")
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# System Packages
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
# External Packages
|
||||
from pydantic import BaseModel, validator
|
||||
@@ -56,6 +56,7 @@ class ContentConfig(ConfigBase):
|
||||
image: Optional[ImageContentConfig]
|
||||
music: Optional[TextContentConfig]
|
||||
markdown: Optional[TextContentConfig]
|
||||
plugins: Optional[Dict[str, TextContentConfig]]
|
||||
|
||||
|
||||
class TextSearchConfig(ConfigBase):
|
||||
|
||||
@@ -8,6 +8,7 @@ import torch
|
||||
from pathlib import Path
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils import config as utils_config
|
||||
from khoj.utils.config import SearchModels, ProcessorConfigModel
|
||||
from khoj.utils.helpers import LRU
|
||||
from khoj.utils.rawconfig import FullConfig
|
||||
@@ -23,6 +24,7 @@ port: int = None
|
||||
cli_args: List[str] = None
|
||||
query_cache = LRU()
|
||||
search_index_lock = threading.Lock()
|
||||
SearchType = utils_config.SearchType
|
||||
|
||||
if torch.cuda.is_available():
|
||||
# Use CUDA GPU
|
||||
|
||||
@@ -1,19 +1,28 @@
|
||||
# External Packages
|
||||
import os
|
||||
from copy import deepcopy
|
||||
from fastapi.testclient import TestClient
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
# Internal Packages
|
||||
from khoj.main import app
|
||||
from khoj.configure import configure_processor, configure_routes, configure_search_types
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.utils.helpers import resolve_absolute_path
|
||||
from khoj.utils.rawconfig import (
|
||||
ContentConfig,
|
||||
ConversationProcessorConfig,
|
||||
ProcessorConfig,
|
||||
TextContentConfig,
|
||||
ImageContentConfig,
|
||||
SearchConfig,
|
||||
TextSearchConfig,
|
||||
ImageSearchConfig,
|
||||
)
|
||||
from khoj.utils import state
|
||||
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.search_filter.date_filter import DateFilter
|
||||
from khoj.search_filter.word_filter import WordFilter
|
||||
@@ -64,16 +73,96 @@ def content_config(tmp_path_factory, search_config: SearchConfig):
|
||||
content_config.org = TextContentConfig(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/org/*.org"],
|
||||
compressed_jsonl=content_dir.joinpath("notes.jsonl.gz"),
|
||||
compressed_jsonl=content_dir.joinpath("notes.jsonl"),
|
||||
embeddings_file=content_dir.joinpath("note_embeddings.pt"),
|
||||
)
|
||||
|
||||
filters = [DateFilter(), WordFilter(), FileFilter()]
|
||||
text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||
|
||||
content_config.plugins = {
|
||||
"plugin1": TextContentConfig(
|
||||
input_files=[content_dir.joinpath("notes.jsonl")],
|
||||
input_filter=None,
|
||||
compressed_jsonl=content_dir.joinpath("plugin.jsonl.gz"),
|
||||
embeddings_file=content_dir.joinpath("plugin_embeddings.pt"),
|
||||
)
|
||||
}
|
||||
|
||||
filters = [DateFilter(), WordFilter(), FileFilter()]
|
||||
text_search.setup(
|
||||
JsonlToJsonl, content_config.plugins["plugin1"], search_config.asymmetric, regenerate=False, filters=filters
|
||||
)
|
||||
|
||||
return content_config
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def md_content_config(tmp_path_factory):
    """Build a content config that only covers the markdown test notes.

    The compressed jsonl and embeddings file paths are placed inside a
    temporary "content" directory created through ``tmp_path_factory``.
    """
    markdown_dir = tmp_path_factory.mktemp("content")

    # Describe where the markdown notes live and where their index artifacts go
    config = ContentConfig()
    markdown_settings = TextContentConfig(
        input_files=None,
        input_filter=["tests/data/markdown/*.md"],
        compressed_jsonl=markdown_dir.joinpath("markdown.jsonl"),
        embeddings_file=markdown_dir.joinpath("markdown_embeddings.pt"),
    )
    config.markdown = markdown_settings

    return config
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def processor_config(tmp_path_factory):
    """Build the processor config used by the chat tests.

    Only the conversation processor is configured, and it is configured
    solely when the ``OPENAI_API_KEY`` environment variable is set.
    Returns ``None`` when the key is missing.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    logs_dir = tmp_path_factory.mktemp("processor")

    if api_key:
        # Conversation logs are written under the temporary processor directory
        config = ProcessorConfig()
        config.conversation = ConversationProcessorConfig(
            openai_api_key=api_key,
            conversation_logfile=logs_dir.joinpath("conversation_logs.json"),
        )
        return config

    # Without an API key there is nothing to configure
    return None
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def chat_client(md_content_config: ContentConfig, search_config: SearchConfig, processor_config: ProcessorConfig):
    """Return a test client for the app with the markdown notes set up for search.

    Stores the given content and search configs on the shared app state, sets
    up markdown search, configures the processor and registers the routes.
    """
    # Point the shared app state at the markdown-only configuration
    app_config = state.config
    app_config.content_type = md_content_config
    app_config.search_type = search_config
    state.SearchType = configure_search_types(app_config)

    # Set up markdown search without forcing regeneration
    markdown_filters = [DateFilter(), WordFilter(), FileFilter()]
    state.model.markdown_search = text_search.setup(
        MarkdownToJsonl,
        md_content_config.markdown,
        search_config.asymmetric,
        regenerate=False,
        filters=markdown_filters,
    )

    # Set up the processor from its config
    state.processor_config = configure_processor(processor_config)

    configure_routes(app)
    test_client = TestClient(app)
    return test_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def client(content_config: ContentConfig, search_config: SearchConfig, processor_config: ProcessorConfig):
    """Return a test client for the app using the given content and search configs.

    ``processor_config`` is not used in the body; it is requested as a fixture
    dependency only.
    """
    app_config = state.config
    app_config.content_type = content_config
    app_config.search_type = search_config
    state.SearchType = configure_search_types(app_config)

    configure_routes(app)
    test_client = TestClient(app)
    return test_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def new_org_file(content_config: ContentConfig):
|
||||
# Setup
|
||||
|
||||
@@ -6,6 +6,17 @@ content-type:
|
||||
embeddings-file: ".note_embeddings.pt"
|
||||
index-header-entries: true
|
||||
|
||||
plugins:
|
||||
content_plugin_1:
|
||||
input-files: [ "content_plugin_1_new.jsonl.gz" ]
|
||||
compressed-jsonl: "content_plugin_1.jsonl.gz"
|
||||
embeddings-file: "content_plugin_1_embeddings.pt"
|
||||
|
||||
content_plugin_2:
|
||||
input-filter: [ "*2_new.jsonl.gz" ]
|
||||
compressed-jsonl: "content_plugin_2.jsonl.gz"
|
||||
embeddings-file: "content_plugin_2_embeddings.pt"
|
||||
|
||||
search-type:
|
||||
asymmetric:
|
||||
encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3"
|
||||
|
||||
8
tests/data/markdown/Birthday Gift for Xiu turning 4.md
Normal file
8
tests/data/markdown/Birthday Gift for Xiu turning 4.md
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
---
|
||||
SCHEDULED: 2014-03-27
|
||||
---
|
||||
|
||||
Went to Araam Bazaar to get a gift for Xiu's birthday.
|
||||
|
||||
Xi and Fang have booked the Taifeld Rooftop Restaurant for the evening of April 1st.
|
||||
8
tests/data/markdown/Hike Mt. Kilimanjaro.md
Normal file
8
tests/data/markdown/Hike Mt. Kilimanjaro.md
Normal file
@@ -0,0 +1,8 @@
|
||||
---
|
||||
SCHEDULED: 2022-03-10 05:00-18:00
|
||||
---
|
||||
Hiked Mt. Kilimanjaro with Pablo and his son Amal (is 17 now!). It was an incredible experience. Pablo is an experienced mountaineer, having climbed Mt. Kilimanjaro twice before, and Amal was taking on the challenge for the first time.
|
||||
|
||||
We set off at 5:10 AM in the morning and made it to the summit just before sunset at 17:45. The last 2 hours were very steep and strenuous, especially with the strong winds. Pablo said he hadn't experienced such winds during his previous 2 hikes.
|
||||
|
||||
But once we reached the summit the views were breathtaking and the feeling of accomplishment at the top was priceless.
|
||||
14
tests/data/markdown/Meet Arun and Pablo for Lunch.md
Normal file
14
tests/data/markdown/Meet Arun and Pablo for Lunch.md
Normal file
@@ -0,0 +1,14 @@
|
||||
---
|
||||
SCHEDULED: 2023-04-01
|
||||
CLOSED: 2023-04-01
|
||||
---
|
||||
|
||||
Met Pablo and Arun for Lunch at Arak, Medellin.
|
||||
|
||||
Arun just sold his apartment in Nairobi and is moving with his wife to Medellin in April 2023!
|
||||
|
||||
Pablo mentioned his son Amal just got admission into the Colegio Superior de Gastronomia in Mexico City. Last of his 3 kids to leave the nest!
|
||||
|
||||
|
||||
2023-04-01 "Arak" "Dosa for Lunch"
|
||||
Expenses:Food:Dining 11.00 USD
|
||||
9
tests/data/markdown/Miscellaneous Transactions.md
Normal file
9
tests/data/markdown/Miscellaneous Transactions.md
Normal file
@@ -0,0 +1,9 @@
|
||||
|
||||
2023-04-01 "Naco Taco" "Tacos for Dinner"
|
||||
Expenses:Food:Dining 7.00 USD
|
||||
|
||||
2020-04-01 "SuperMercado" "Bananas"
|
||||
Expenses:Food:Groceries 3.00 USD
|
||||
|
||||
2023-03-01 "Naco Taco" "Burritos for Dinner"
|
||||
Expenses:Food:Dining 5.00 USD
|
||||
3
tests/data/markdown/Namita.md
Normal file
3
tests/data/markdown/Namita.md
Normal file
@@ -0,0 +1,3 @@
|
||||
Namita is married to Dhruva. They have 2 sons, Harshal and Gaurav.
|
||||
She turned 30 on March 5th 2022.
|
||||
She runs a Pharmacy at Ramnath street in Kolkata.
|
||||
34
tests/data/markdown/Patent 6631372.md
Normal file
34
tests/data/markdown/Patent 6631372.md
Normal file
@@ -0,0 +1,34 @@
|
||||
# 058 6,631,372
|
||||
|
||||
March 2006, rev August 2009
|
||||
|
||||
A couple days ago I found to my surprise that I'd been granted a [patent](http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=6,631,372.PN.&OS=PN/6,631,372&RS=PN/6,631,372).
|
||||
It issued in 2003, but no one told me. I wouldn't know about it now except that a few months ago, while visiting Yahoo, I happened to run into a Big Cheese I knew from working there in the late nineties. He brought up something called Revenue Loop, which Viaweb had been working on when they bought us.
|
||||
|
||||
The idea is basically that you sort search results not in order of textual "relevance" (as search engines did then) nor in order of how much advertisers bid (as Overture did) but in order of the bid times the number of transactions. Ordinarily you'd do this for shopping searches, though in fact one of the features of our scheme is that it automatically detects which searches are shopping searches.
|
||||
|
||||
If you just order the results in order of bids, you can make the search results useless, because the first results could be dominated by lame sites that had bid the most. But if you order results by bid multiplied by transactions, far from selling out, you're getting a _better_ measure of relevance. What could be a better sign that someone was satisfied with a search result than going to the site and buying something?
|
||||
|
||||
And, of course, this algorithm automatically maximizes the revenue of the search engine.
|
||||
|
||||
Everyone is focused on this type of approach now, but few were in 1998. In 1998 it was all about selling banner ads. We didn't know that, so we were pretty excited when we figured out what seemed to us the optimal way of doing shopping searches.
|
||||
|
||||
When Yahoo was thinking of buying us, we had a meeting with Jerry Yang in New York. For him, I now realize, this was supposed to be one of those meetings when you check out a company you've pretty much decided to buy, just to make sure they're ok guys. We weren't expected to do more than chat and seem smart and reasonable. He must have been dismayed when I jumped up to the whiteboard and launched into a presentation of our exciting new technology.
|
||||
|
||||
I was just as dismayed when he didn't seem to care at all about it. At the time I thought, "boy, is this guy poker-faced. We present to him what has to be the optimal way of sorting product search results, and he's not even curious." I didn't realize till much later why he didn't care. In 1998, advertisers were overpaying enormously for ads on web sites. In 1998, if advertisers paid the maximum that traffic was worth to them, Yahoo's revenues would have _decreased._
|
||||
|
||||
Things are different now, of course. Now this sort of thing is all the rage. So when I ran into the Yahoo exec I knew from the old days in the Yahoo cafeteria a few months ago, the first thing he remembered was not (fortunately) all the fights I had with him, but Revenue Loop.
|
||||
|
||||
"Well," I said, "I think we actually applied for a patent on it. I'm not sure what happened to the application after I left."
|
||||
|
||||
"Really? That would be an important patent."
|
||||
|
||||
So someone investigated, and sure enough, that patent application had continued in the pipeline for several years after, and finally issued in 2003.
|
||||
|
||||
The main thing that struck me on reading it, actually, is that lawyers at some point messed up my nice clear writing. Some clever person with a spell checker reduced one section to Zen-like incomprehensibility:
|
||||
|
||||
> Also, common spelling errors will tend to get fixed. For example, if users searching for "compact disc player" end up spending considerable money at sites offering compact disc players, then those pages will have a higher relevance for that search phrase, even though the phrase "compact disc player" is not present on those pages.
|
||||
|
||||
(That "compat disc player" wasn't a typo, guys.)
|
||||
|
||||
For the fine prose of the original, see the provisional application of February 1998, back when we were still Viaweb and couldn't afford to pay lawyers to turn every "a lot of" into "considerable."
|
||||
14
tests/data/markdown/Preparing to File Taxes for 2022.md
Normal file
14
tests/data/markdown/Preparing to File Taxes for 2022.md
Normal file
@@ -0,0 +1,14 @@
|
||||
---
|
||||
SCHEDULED: 2022-03-10
|
||||
CLOSED: 2022-03-31
|
||||
---
|
||||
As annoying as it is, start early in preparing for this.
|
||||
|
||||
Tips for next time to help get ready for filing your taxes:
|
||||
- *Gather all your documents*: Start collecting the documents you’ll need to file your taxes, such as your W-2, 1099, 1098, and other income statements.
|
||||
- *Know your filing status*: Your filing status can determine which forms you need to fill out, so make sure you know which one applies to you.
|
||||
- *Check for credits and deductions*: Check to see if you qualify for any credits or deductions that could reduce your tax liability.
|
||||
- *Deadlines*: Have to file by March 31st of the Calendar year.
|
||||
- *Choose the right tax preparer*: Went with Kamal this time as he'd done my taxes last year. But feel like can get better deal if I shop around before next time.
|
||||
|
||||
By taking the time to prepare for filing your taxes early, you can make sure that you don’t miss any important deadlines or details and get the most out
|
||||
11
tests/data/markdown/Sign Wayne Enterprises Offer Letter.md
Normal file
11
tests/data/markdown/Sign Wayne Enterprises Offer Letter.md
Normal file
@@ -0,0 +1,11 @@
|
||||
---
|
||||
2005-01-10
|
||||
---
|
||||
|
||||
Awesome start to the new year! Finally got an offer from Wayne Enterprises after 2 months of job hunting.
|
||||
|
||||
I hadn't heard of them before for some reason. But I'm going to be doing R&D work to make Bomb proof jackets and Invisibility cloaks, which sounds awesome!
|
||||
|
||||
Plus when I asked who their customers were, they said (during the interview) that the work they do is meant for Charity and to help the downtrodden people of the city. They do not sell such products to the military/police. So that allayed my fears of working on stuff that would cause negative impact on society. That's the reason I left Acme Inc. in the first place!
|
||||
|
||||
The salary is great, $40K! But I'll have to move to Gotham. I'd have preferred to be near nature but everyone is saying it's better to try Gotham out when you're younger. 🤷🏾
|
||||
@@ -0,0 +1,7 @@
|
||||
---
|
||||
2004-10-02
|
||||
---
|
||||
|
||||
It's a little emotional as this was my first real job. Been working at Acme Inc for (exactly) 4 years now. Had a going away party and farewell chats with a bunch of folks like Rita, and Yan.
|
||||
|
||||
I'll be looking for jobs better aligned with my values going forward.
|
||||
10
tests/data/markdown/Visit Seregenti.md
Normal file
10
tests/data/markdown/Visit Seregenti.md
Normal file
@@ -0,0 +1,10 @@
|
||||
---
|
||||
SCHEDULED: <2022-03-05-2022-03-09>
|
||||
CLOSED: [2022-03-10]
|
||||
---
|
||||
Went to Serengeti with Pablo, Namita and all our immediate families. So 9 of us!
|
||||
Saw lions, hyenas and cheetahs. Had a swim in the Savannah falls.
|
||||
|
||||
The kids enjoyed scaring the Zebras from the Safari jeeps but the driver was not happy about that.
|
||||
|
||||
Had dinner at Rakshas Lounge, a chic restaurant in the middle of the Serengeti (weird juxtaposition).
|
||||
3
tests/data/markdown/Xi Li.md
Normal file
3
tests/data/markdown/Xi Li.md
Normal file
@@ -0,0 +1,3 @@
|
||||
Xi's Birthday is on 1st Jan 1980 in Fujiang. He is married to Fang.
|
||||
He works in the construction industry building tractors.
|
||||
The daughter of Xi and Fang is Xiu, she was born on April 1st 2010.
|
||||
41
tests/data/markdown/copy_what_you_like.md
Normal file
41
tests/data/markdown/copy_what_you_like.md
Normal file
@@ -0,0 +1,41 @@
|
||||
# 066 Copy What You Like
|
||||
|
||||
[](https://sep.yimg.com/ca/I/paulgraham_2202_8480086)
|
||||
|
||||
July 2006
|
||||
|
||||
When I was in high school I spent a lot of time imitating bad writers. What we studied in English classes was mostly fiction, so I assumed that was the highest form of writing. Mistake number one. The stories that seemed to be most admired were ones in which people suffered in complicated ways. Anything funny or gripping was ipso facto suspect, unless it was old enough to be hard to understand, like Shakespeare or Chaucer. Mistake number two. The ideal medium seemed the short story, which I've since learned had quite a brief life, roughly coincident with the peak of magazine publishing. But since their size made them perfect for use in high school classes, we read a lot of them, which gave us the impression the short story was flourishing. Mistake number three. And because they were so short, nothing really had to happen; you could just show a randomly truncated slice of life, and that was considered advanced. Mistake number four. The result was that I wrote a lot of stories in which nothing happened except that someone was unhappy in a way that seemed deep.
|
||||
|
||||
For most of college I was a philosophy major. I was very impressed by the papers published in philosophy journals. They were so beautifully typeset, and their tone was just captivating--alternately casual and buffer-overflowingly technical. A fellow would be walking along a street and suddenly modality qua modality would spring upon him. I didn't ever quite understand these papers, but I figured I'd get around to that later, when I had time to reread them more closely. In the meantime I tried my best to imitate them. This was, I can now see, a doomed undertaking, because they weren't really saying anything. No philosopher ever refuted another, for example, because no one said anything definite enough to refute. Needless to say, my imitations didn't say anything either.
|
||||
|
||||
In grad school I was still wasting time imitating the wrong things. There was then a fashionable type of program called an expert system, at the core of which was something called an inference engine. I looked at what these things did and thought "I could write that in a thousand lines of code." And yet eminent professors were writing books about them, and startups were selling them for a year's salary a copy. What an opportunity, I thought; these impressive things seem easy to me; I must be pretty sharp. Wrong. It was simply a fad. The books the professors wrote about expert systems are now ignored. They were not even on a _path_ to anything interesting. And the customers paying so much for them were largely the same government agencies that paid thousands for screwdrivers and toilet seats.
|
||||
|
||||
How do you avoid copying the wrong things? Copy only what you genuinely like. That would have saved me in all three cases. I didn't enjoy the short stories we had to read in English classes; I didn't learn anything from philosophy papers; I didn't use expert systems myself. I believed these things were good because they were admired.
|
||||
|
||||
It can be hard to separate the things you like from the things you're impressed with. One trick is to ignore presentation. Whenever I see a painting impressively hung in a museum, I ask myself: how much would I pay for this if I found it at a garage sale, dirty and frameless, and with no idea who painted it? If you walk around a museum trying this experiment, you'll find you get some truly startling results. Don't ignore this data point just because it's an outlier.
|
||||
|
||||
Another way to figure out what you like is to look at what you enjoy as guilty pleasures. Many things people like, especially if they're young and ambitious, they like largely for the feeling of virtue in liking them. 99% of people reading _Ulysses_ are thinking "I'm reading _Ulysses_ " as they do it. A guilty pleasure is at least a pure one. What do you read when you don't feel up to being virtuous? What kind of book do you read and feel sad that there's only half of it left, instead of being impressed that you're half way through? That's what you really like.
|
||||
|
||||
Even when you find genuinely good things to copy, there's another pitfall to be avoided. Be careful to copy what makes them good, rather than their flaws. It's easy to be drawn into imitating flaws, because they're easier to see, and of course easier to copy too. For example, most painters in the eighteenth and nineteenth centuries used brownish colors. They were imitating the great painters of the Renaissance, whose paintings by that time were brown with dirt. Those paintings have since been cleaned, revealing brilliant colors; their imitators are of course still brown.
|
||||
|
||||
It was painting, incidentally, that cured me of copying the wrong things. Halfway through grad school I decided I wanted to try being a painter, and the art world was so manifestly corrupt that it snapped the leash of credulity. These people made philosophy professors seem as scrupulous as mathematicians. It was so clearly a choice of doing good work xor being an insider that I was forced to see the distinction. It's there to some degree in almost every field, but I had till then managed to avoid facing it.
|
||||
|
||||
That was one of the most valuable things I learned from painting: you have to figure out for yourself what's [good](taste.html). You can't trust authorities. They'll lie to you on this one.
|
||||
|
||||
---
|
||||
|
||||
[](http://reddit.com) [ Comment](http://reddit.com/info/9bm4/comments) on this essay.
|
||||
|
||||
---
|
||||
|
||||
[Chinese Translation](http://mailper.googlepages.com/copywhatyoulike)
|
||||
|
||||
[Romanian Translation](http://ro.goobix.com/pg/copy/)
|
||||
|
||||
|
||||
|
||||
[Spanish Translation](http://www.simpleoption.com/ensayo-copia-lo-que-te-gusta)
|
||||
|
||||
[Russian Translation](http://ryba4.com/translations/copy)
|
||||
|
||||
* * *
|
||||
68
tests/data/markdown/having_kids.md
Normal file
68
tests/data/markdown/having_kids.md
Normal file
@@ -0,0 +1,68 @@
|
||||
# 183 Having Kids
|
||||
|
||||
|
||||
|
||||
December 2019
|
||||
|
||||
Before I had kids, I was afraid of having kids. Up to that point I felt about kids the way the young Augustine felt about living virtuously. I'd have been sad to think I'd never have children. But did I want them now? No.
|
||||
|
||||
If I had kids, I'd become a parent, and parents, as I'd known since I was a kid, were uncool. They were dull and responsible and had no fun. And while it's not surprising that kids would believe that, to be honest I hadn't seen much as an adult to change my mind. Whenever I'd noticed parents with kids, the kids seemed to be terrors, and the parents pathetic harried creatures, even when they prevailed.
|
||||
|
||||
When people had babies, I congratulated them enthusiastically, because that seemed to be what one did. But I didn't feel it at all. "Better you than me," I was thinking.
|
||||
|
||||
Now when people have babies I congratulate them enthusiastically and I mean it. Especially the first one. I feel like they just got the best gift in the world.
|
||||
|
||||
What changed, of course, is that I had kids. Something I dreaded turned out to be wonderful.
|
||||
|
||||
Partly, and I won't deny it, this is because of serious chemical changes that happened almost instantly when our first child was born. It was like someone flipped a switch. I suddenly felt protective not just toward our child, but toward all children. As I was driving my wife and new son home from the hospital, I approached a crosswalk full of pedestrians, and I found myself thinking "I have to be really careful of all these people. Every one of them is someone's child!"
|
||||
|
||||
So to some extent you can't trust me when I say having kids is great. To some extent I'm like a religious cultist telling you that you'll be happy if you join the cult too — but only because joining the cult will alter your mind in a way that will make you happy to be a cult member.
|
||||
|
||||
But not entirely. There were some things about having kids that I clearly got wrong before I had them.
|
||||
|
||||
For example, there was a huge amount of selection bias in my observations of parents and children. Some parents may have noticed that I wrote "Whenever I'd noticed parents with kids." Of course the times I noticed kids were when things were going wrong. I only noticed them when they made noise. And where was I when I noticed them? Ordinarily I never went to places with kids, so the only times I encountered them were in shared bottlenecks like airplanes. Which is not exactly a representative sample. Flying with a toddler is something very few parents enjoy.
|
||||
|
||||
What I didn't notice, because they tend to be much quieter, were all the great moments parents had with kids. People don't talk about these much — the magic is hard to put into words, and all other parents know about them anyway — but one of the great things about having kids is that there are so many times when you feel there is nowhere else you'd rather be, and nothing else you'd rather be doing. You don't have to be doing anything special. You could just be going somewhere together, or putting them to bed, or pushing them on the swings at the park. But you wouldn't trade these moments for anything. One doesn't tend to associate kids with peace, but that's what you feel. You don't need to look any further than where you are right now.
|
||||
|
||||
Before I had kids, I had moments of this kind of peace, but they were rarer. With kids it can happen several times a day.
|
||||
|
||||
My other source of data about kids was my own childhood, and that was similarly misleading. I was pretty bad, and was always in trouble for something or other. So it seemed to me that parenthood was essentially law enforcement. I didn't realize there were good times too.
|
||||
|
||||
I remember my mother telling me once when I was about 30 that she'd really enjoyed having me and my sister. My god, I thought, this woman is a saint. She not only endured all the pain we subjected her to, but actually enjoyed it? Now I realize she was simply telling the truth.
|
||||
|
||||
She said that one reason she liked having us was that we'd been interesting to talk to. That took me by surprise when I had kids. You don't just love them. They become your friends too. They're really interesting. And while I admit small children are disastrously fond of repetition (anything worth doing once is worth doing fifty times) it's often genuinely fun to play with them. That surprised me too. Playing with a 2 year old was fun when I was 2 and definitely not fun when I was 6. Why would it become fun again later? But it does.
|
||||
|
||||
There are of course times that are pure drudgery. Or worse still, terror. Having kids is one of those intense types of experience that are hard to imagine unless you've had them. But it is not, as I implicitly believed before having kids, simply your DNA heading for the lifeboats.
|
||||
|
||||
Some of my worries about having kids were right, though. They definitely make you less productive. I know having kids makes some people get their act together, but if your act was already together, you're going to have less time to do it in. In particular, you're going to have to work to a schedule. Kids have schedules. I'm not sure if it's because that's how kids are, or because it's the only way to integrate their lives with adults', but once you have kids, you tend to have to work on their schedule.
|
||||
|
||||
You will have chunks of time to work. But you can't let work spill promiscuously through your whole life, like I used to before I had kids. You're going to have to work at the same time every day, whether inspiration is flowing or not, and there are going to be times when you have to stop, even if it is.
|
||||
|
||||
I've been able to adapt to working this way. Work, like love, finds a way. If there are only certain times it can happen, it happens at those times. So while I don't get as much done as before I had kids, I get enough done.
|
||||
|
||||
I hate to say this, because being ambitious has always been a part of my identity, but having kids may make one less ambitious. It hurts to see that sentence written down. I squirm to avoid it. But if there weren't something real there, why would I squirm? The fact is, once you have kids, you're probably going to care more about them than you do about yourself. And attention is a zero-sum game. Only one idea at a time can be the [_top idea in your mind_](top.html). Once you have kids, it will often be your kids, and that means it will less often be some project you're working on.
|
||||
|
||||
I have some hacks for sailing close to this wind. For example, when I write essays, I think about what I'd want my kids to know. That drives me to get things right. And when I was writing [_Bel_](bel.html), I told my kids that once I finished it I'd take them to Africa. When you say that sort of thing to a little kid, they treat it as a promise. Which meant I had to finish or I'd be taking away their trip to Africa. Maybe if I'm really lucky such tricks could put me net ahead. But the wind is there, no question.
|
||||
|
||||
On the other hand, what kind of wimpy ambition do you have if it won't survive having kids? Do you have so little to spare?
|
||||
|
||||
And while having kids may be warping my present judgement, it hasn't overwritten my memory. I remember perfectly well what life was like before. Well enough to miss some things a lot, like the ability to take off for some other country at a moment's notice. That was so great. Why did I never do that?
|
||||
|
||||
See what I did there? The fact is, most of the freedom I had before kids, I never used. I paid for it in loneliness, but I never used it.
|
||||
|
||||
I had plenty of happy times before I had kids. But if I count up happy moments, not just potential happiness but actual happy moments, there are more after kids than before. Now I practically have it on tap, almost any bedtime.
|
||||
|
||||
People's experiences as parents vary a lot, and I know I've been lucky. But I think the worries I had before having kids must be pretty common, and judging by other parents' faces when they see their kids, so must the happiness that kids bring.
|
||||
|
||||
---
|
||||
**Note**
|
||||
|
||||
[1] Adults are sophisticated enough to see 2 year olds for the fascinatingly complex characters they are, whereas to most 6 year olds, 2 year olds are just defective 6 year olds.
|
||||
|
||||
|
||||
**Thanks** to Trevor Blackwell, Jessica Livingston, and Robert Morris for reading drafts of this.
|
||||
|
||||
---
|
||||
[Arabic Translation](https://tldrarabiccontents.blogspot.com/2020/02/blog-post_3.html) [Slovak Translation](https://otcom.sk/paul-graham-mat-deti/)
|
||||
|
||||
---
|
||||
41
tests/data/markdown/how_y_combinator_started.md
Normal file
41
tests/data/markdown/how_y_combinator_started.md
Normal file
@@ -0,0 +1,41 @@
|
||||
# 145 How Y Combinator Started
|
||||
|
||||
|
||||
|
||||
March 2012
|
||||
|
||||
Y Combinator's 7th birthday was March 11. As usual we were so busy we didn't notice till a few days after. I don't think we've ever managed to remember our birthday on our birthday.
|
||||
|
||||
On March 11 2005, Jessica and I were walking home from dinner in Harvard Square. Jessica was working at an investment bank at the time, but she didn't like it much, so she had interviewed for a job as director of marketing at a Boston VC fund. The VC fund was doing what now seems a comically familiar thing for a VC fund to do: taking a long time to make up their mind. Meanwhile I had been telling Jessica all the things they should change about the VC business -- essentially the ideas now underlying Y Combinator: investors should be making more, smaller investments, they should be funding hackers instead of suits, they should be willing to fund younger founders, etc.
|
||||
|
||||
At the time I had been thinking about doing some angel investing. I had just given a talk to the undergraduate computer club at Harvard about [how to start a startup](start.html), and it hit me afterward that although I had always meant to do angel investing, 7 years had now passed since I got enough money to do it, and I still hadn't started. I had also been thinking about ways to work with Robert Morris and Trevor Blackwell again. A few hours before I had sent them an email trying to figure out what we could do together.
|
||||
|
||||
Between Harvard Square and my house the idea gelled. We'd start our own investment firm and Jessica could work for that instead. As we turned onto Walker Street we decided to do it. I agreed to put $100k into the new fund and Jessica agreed to quit her job to work for it. Over the next couple days I recruited Robert and Trevor, who put in another $50k each. So YC started with $200k.
|
||||
|
||||
Jessica was so happy to be able to quit her job and start her own company that I took her [picture](https://web.archive.org/web/20170609055553/http://www.ycombinator.com/yc05.html) when we got home.
|
||||
|
||||
The company wasn't called Y Combinator yet. At first we called it Cambridge Seed. But that name never saw the light of day, because by the time we announced it a few days later, we'd changed the name to Y Combinator. We realized early on that what we were doing could be national in scope and we didn't want a name that tied us to one place.
|
||||
|
||||
Initially we only had part of the idea. We were going to do seed funding with standardized terms. Before YC, seed funding was very haphazard. You'd get that first $10k from your friend's rich uncle. The deal terms were often a disaster; often neither the investor nor the founders nor the lawyer knew what the documents should look like. Facebook's early history as a Florida LLC shows how random things could be in those days. We were going to be something there had not been before: a standard source of seed funding.
|
||||
|
||||
We modelled YC on the seed funding we ourselves had taken when we started Viaweb. We started Viaweb with $10k we got from our friend [Julian Weber](julian.html), the husband of Idelle Weber, whose painting class I took as a grad student at Harvard. Julian knew about business, but you would not describe him as a suit. Among other things he'd been president of the _National Lampoon_. He was also a lawyer, and got all our paperwork set up properly. In return for $10k, getting us set up as a company, teaching us what business was about, and remaining calm in times of crisis, Julian got 10% of Viaweb. I remember thinking once what a good deal Julian got. And then a second later I realized that without Julian, Viaweb would never have made it. So even though it was a good deal for him, it was a good deal for us too. That's why I knew there was room for something like Y Combinator.
|
||||
|
||||
Initially we didn't have what turned out to be the most important idea: funding startups synchronously, instead of asynchronously as it had always been done before. Or rather we had the idea, but we didn't realize its significance. We decided very early that the first thing we'd do would be to fund a bunch of startups over the coming summer. But we didn't realize initially that this would be the way we'd do all our investing. The reason we began by funding a bunch of startups at once was not that we thought it would be a better way to fund startups, but simply because we wanted to learn how to be angel investors, and a summer program for undergrads seemed the fastest way to do it. No one takes summer jobs that seriously. The opportunity cost for a bunch of undergrads to spend a summer working on startups was low enough that we wouldn't feel guilty encouraging them to do it.
|
||||
|
||||
We knew students would already be making plans for the summer, so we did what we're always telling startups to do: we launched fast. Here are the initial [announcement](summerfounder.html) and [description](https://web.archive.org/web/20170609055553/http://ycombinator.com/old/sfp.html) of what was at the time called the Summer Founders Program.
|
||||
|
||||
We got lucky in that the length and structure of a summer program turns out to be perfect for what we do. The structure of the YC cycle is still almost identical to what it was that first summer.
|
||||
|
||||
We also got lucky in who the first batch of founders were. We never expected to make any money from that first batch. We thought of the money we were investing as a combination of an educational expense and a charitable donation. But the founders in the first batch turned out to be surprisingly good. And great people too. We're still friends with a lot of them today.
|
||||
|
||||
It's hard for people to realize now how inconsequential YC seemed at the time. I can't blame people who didn't take us seriously, because we ourselves didn't take that first summer program seriously in the very beginning. But as the summer progressed we were increasingly impressed by how well the startups were doing. Other people started to be impressed too. Jessica and I invented a term, "the Y Combinator effect," to describe the moment when the realization hit someone that YC was not totally lame. When people came to YC to speak at the dinners that first summer, they came in the spirit of someone coming to address a Boy Scout troop. By the time they left the building they were all saying some variant of "Wow, these companies might actually succeed."
|
||||
|
||||
Now YC is well enough known that people are no longer surprised when the companies we fund are legit, but it took a while for reputation to catch up with reality. That's one of the reasons we especially like funding ideas that might be dismissed as "toys" -- because YC itself was dismissed as one initially.
|
||||
|
||||
When we saw how well it worked to fund companies synchronously, we decided we'd keep doing that. We'd fund two batches of startups a year.
|
||||
|
||||
We funded the second batch in Silicon Valley. That was a last minute decision. In retrospect I think what pushed me over the edge was going to Foo Camp that fall. The density of startup people in the Bay Area was so much greater than in Boston, and the weather was so nice. I remembered that from living there in the 90s. Plus I didn't want someone else to copy us and describe it as the Y Combinator of Silicon Valley. I wanted YC to be the Y Combinator of Silicon Valley. So doing the winter batch in California seemed like one of those rare cases where the self-indulgent choice and the ambitious one were the same.
|
||||
|
||||
If we'd had enough time to do what we wanted, Y Combinator would have been in Berkeley. That was our favorite part of the Bay Area. But we didn't have time to get a building in Berkeley. We didn't have time to get our own building anywhere. The only way to get enough space in time was to convince Trevor to let us take over part of his (as it then seemed) giant building in Mountain View. Yet again we lucked out, because Mountain View turned out to be the ideal place to put something like YC. But even then we barely made it. The first dinner in California, we had to warn all the founders not to touch the walls, because the paint was still wet.
|
||||
|
||||
* * *
|
||||
@@ -1,69 +0,0 @@
|
||||
# Emacs Khoj
|
||||
|
||||
*An Emacs interface for [Khoj](https://github.com/debanjum/khoj)*
|
||||
|
||||
## Requirements
|
||||
|
||||
- Install and Run [Khoj](https://github.com/debanjum/khoj)
|
||||
|
||||
## Installation
|
||||
|
||||
- Direct Install
|
||||
- Put `khoj.el` in your Emacs load path, e.g. \~/.emacs.d/lisp
|
||||
|
||||
- Load via `use-package` in your \~/.emacs.d/init.el or .emacs
|
||||
file by adding below snippet
|
||||
|
||||
``` elisp
|
||||
;; Khoj Package
|
||||
(use-package khoj
|
||||
:load-path "~/.emacs.d/lisp/khoj.el"
|
||||
:bind ("C-c s" . 'khoj))
|
||||
```
|
||||
- With [straight.el](https://github.com/raxod502/straight.el)
|
||||
- Add below snippet to your \~/.emacs.d/init.el or .emacs config
|
||||
file and execute it.
|
||||
|
||||
``` elisp
|
||||
;; Khoj Package for Semantic Search
|
||||
(use-package khoj
|
||||
:after org
|
||||
:straight (khoj :type git :host github :repo "debanjum/khoj" :files (:defaults "src/interface/emacs/khoj.el"))
|
||||
:bind ("C-c s" . 'khoj))
|
||||
```
|
||||
- With [Quelpa](https://github.com/quelpa/quelpa#installation)
|
||||
- Ensure [Quelpa](https://github.com/quelpa/quelpa#installation),
|
||||
[quelpa-use-package](https://github.com/quelpa/quelpa-use-package#installation)
|
||||
are installed
|
||||
|
||||
- Add below snippet to your \~/.emacs.d/init.el or .emacs config
|
||||
file and execute it.
|
||||
|
||||
``` elisp
|
||||
;; Khoj Package
|
||||
(use-package khoj
|
||||
:after org
|
||||
:quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/debanjum/khoj/master/src/interface/emacs/khoj.el")
|
||||
:bind ("C-c s" . 'khoj))
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
1. Open Query Interface on Client
|
||||
|
||||
- In Emacs: Call `khoj` using keybinding `C-c s` or `M-x khoj`
|
||||
- On Web: Open <http://localhost:8000/>
|
||||
|
||||
2. Query in Natural Language
|
||||
|
||||
e.g \"What is the meaning of life?\" \"What are my life goals?\"
|
||||
|
||||
**Note: It takes about 4s on a Mac M1 and a \>100K line corpus of
|
||||
notes**
|
||||
|
||||
3. (Optional) Narrow down results further
|
||||
|
||||
Include/Exclude specific words or date range from results by
|
||||
updating query with below query format
|
||||
|
||||
e.g \`What is the meaning of life? -god +none dt:\"last week\"\`
|
||||
73
tests/data/markdown/jessica_livingston.md
Normal file
73
tests/data/markdown/jessica_livingston.md
Normal file
@@ -0,0 +1,73 @@
|
||||
# 172 Jessica Livingston
|
||||
|
||||
|
||||
|
||||
November 2015
|
||||
|
||||
A few months ago an article about Y Combinator said that early on it had been a "one-man show." It's sadly common to read that sort of thing. But the problem with that description is not just that it's unfair. It's also misleading. Much of what's most novel about YC is due to Jessica Livingston. If you don't understand her, you don't understand YC. So let me tell you a little about Jessica.
|
||||
|
||||
YC had 4 founders. Jessica and I decided one night to start it, and the next day we recruited my friends Robert Morris and Trevor Blackwell. Jessica and I ran YC day to day, and Robert and Trevor read applications and did interviews with us.
|
||||
|
||||
Jessica and I were already dating when we started YC. At first we tried to act "professional" about this, meaning we tried to conceal it. In retrospect that seems ridiculous, and we soon dropped the pretense. And the fact that Jessica and I were a couple is a big part of what made YC what it was. YC felt like a family. The founders early on were mostly young. We all had dinner together once a week, cooked for the first couple years by me. Our first building had been a private home. The overall atmosphere was shockingly different from a VC's office on Sand Hill Road, in a way that was entirely for the better. There was an authenticity that everyone who walked in could sense. And that didn't just mean that people trusted us. It was the perfect quality to instill in startups. Authenticity is one of the most important things YC looks for in founders, not just because fakers and opportunists are annoying, but because authenticity is one of the main things that separates the most successful startups from the rest.
|
||||
|
||||
Early YC was a family, and Jessica was its mom. And the culture she defined was one of YC's most important innovations. Culture is important in any organization, but at YC culture wasn't just how we behaved when we built the product. At YC, the culture was the product.
|
||||
|
||||
Jessica was also the mom in another sense: she had the last word. Everything we did as an organization went through her first -- who to fund, what to say to the public, how to deal with other companies, who to hire, everything.
|
||||
|
||||
Before we had kids, YC was more or less our life. There was no real distinction between working hours and not. We talked about YC all the time. And while there might be some businesses that it would be tedious to let infect your private life, we liked it. We'd started YC because it was something we were interested in. And some of the problems we were trying to solve were endlessly difficult. How do you recognize good founders? You could talk about that for years, and we did; we still do.
|
||||
|
||||
I'm better at some things than Jessica, and she's better at some things than me. One of the things she's best at is judging people. She's one of those rare individuals with x-ray vision for character. She can see through any kind of faker almost immediately. Her nickname within YC was the Social Radar, and this special power of hers was critical in making YC what it is. The earlier you pick startups, the more you're picking the founders. Later stage investors get to try products and look at growth numbers. At the stage where YC invests, there is often neither a product nor any numbers.
|
||||
|
||||
Others thought YC had some special insight about the future of technology. Mostly we had the same sort of insight Socrates claimed: we at least knew we knew nothing. What made YC successful was being able to pick good founders. We thought Airbnb was a bad idea. We funded it because we liked the founders.
|
||||
|
||||
During interviews, Robert and Trevor and I would pepper the applicants with technical questions. Jessica would mostly watch. A lot of the applicants probably read her as some kind of secretary, especially early on, because she was the one who'd go out and get each new group and she didn't ask many questions. She was ok with that. It was easier for her to watch people if they didn't notice her. But after the interview, the three of us would turn to Jessica and ask "What does the Social Radar say?" [1]
|
||||
|
||||
Having the Social Radar at interviews wasn't just how we picked founders who'd be successful. It was also how we picked founders who were good people. At first we did this because we couldn't help it. Imagine what it would feel like to have x-ray vision for character. Being around bad people would be intolerable. So we'd refuse to fund founders whose characters we had doubts about even if we thought they'd be successful.
|
||||
|
||||
Though we initially did this out of self-indulgence, it turned out to be very valuable to YC. We didn't realize it in the beginning, but the people we were picking would become the YC alumni network. And once we picked them, unless they did something really egregious, they were going to be part of it for life. Some now think YC's alumni network is its most valuable feature. I personally think YC's advice is pretty good too, but the alumni network is certainly among the most valuable features. The level of trust and helpfulness is remarkable for a group of such size. And Jessica is the main reason why.
|
||||
|
||||
(As we later learned, it probably cost us little to reject people whose characters we had doubts about, because how good founders are and how well they do are [_not orthogonal_](mean.html). If bad founders succeed at all, they tend to sell early. The most successful founders are almost all good.)
|
||||
|
||||
If Jessica was so important to YC, why don't more people realize it? Partly because I'm a writer, and writers always get disproportionate attention. YC's brand was initially my brand, and our applicants were people who'd read my essays. But there is another reason: Jessica hates attention. Talking to reporters makes her nervous. The thought of giving a talk paralyzes her. She was even uncomfortable at our wedding, because the bride is always the center of attention. [2]
|
||||
|
||||
It's not just because she's shy that she hates attention, but because it throws off the Social Radar. She can't be herself. You can't watch people when everyone is watching you.
|
||||
|
||||
Another reason attention worries her is that she hates bragging. In anything she does that's publicly visible, her biggest fear (after the obvious fear that it will be bad) is that it will seem ostentatious. She says being too modest is a common problem for women. But in her case it goes beyond that. She has a horror of ostentation so visceral it's almost a phobia.
|
||||
|
||||
She also hates fighting. She can't do it; she just shuts down. And unfortunately there is a good deal of fighting in being the public face of an organization.
|
||||
|
||||
So although Jessica more than anyone made YC unique, the very qualities that enabled her to do it mean she tends to get written out of YC's history. Everyone buys this story that PG started YC and his wife just kind of helped. Even YC's haters buy it. A couple years ago when people were attacking us for not funding more female founders (than exist), they all treated YC as identical with PG. It would have spoiled the narrative to acknowledge Jessica's central role at YC.
|
||||
|
||||
Jessica was boiling mad that people were accusing _her_ company of sexism. I've never seen her angrier about anything. But she did not contradict them. Not publicly. In private there was a great deal of profanity. And she wrote three separate essays about the question of female founders. But she could never bring herself to publish any of them. She'd seen the level of vitriol in this debate, and she shrank from engaging. [3]
|
||||
|
||||
It wasn't just because she disliked fighting. She's so sensitive to character that it repels her even to fight with dishonest people. The idea of mixing it up with linkbait journalists or Twitter trolls would seem to her not merely frightening, but disgusting.
|
||||
|
||||
But Jessica knew her example as a successful female founder would encourage more women to start companies, so last year she did something YC had never done before and hired a PR firm to get her some interviews. At one of the first she did, the reporter brushed aside her insights about startups and turned it into a sensationalistic story about how some guy had tried to chat her up as she was waiting outside the bar where they had arranged to meet. Jessica was mortified, partly because the guy had done nothing wrong, but more because the story treated her as a victim significant only for being a woman, rather than one of the most knowledgeable investors in the Valley.
|
||||
|
||||
After that she told the PR firm to stop.
|
||||
|
||||
You're not going to be hearing in the press about what Jessica has achieved. So let me tell you what Jessica has achieved. Y Combinator is fundamentally a nexus of people, like a university. It doesn't make a product. What defines it is the people. Jessica more than anyone curated and nurtured that collection of people. In that sense she literally made YC.
|
||||
|
||||
Jessica knows more about the qualities of startup founders than anyone else ever has. Her immense data set and x-ray vision are the perfect storm in that respect. The qualities of the founders are the best predictor of how a startup will do. And startups are in turn the most important source of growth in mature economies.
|
||||
|
||||
The person who knows the most about the most important factor in the growth of mature economies -- that is who Jessica Livingston is. Doesn't that sound like someone who should be better known?
|
||||
|
||||
|
||||
---
|
||||
**Notes**
|
||||
|
||||
[1] Harj Taggar reminded me that while Jessica didn't ask many questions, they tended to be important ones:
|
||||
|
||||
"She was always good at sniffing out any red flags about the team or their determination and disarmingly asking the right question, which usually revealed more than the founders realized."
|
||||
|
||||
[2] Or more precisely, while she likes getting attention in the sense of getting credit for what she has done, she doesn't like getting attention in the sense of being watched in real time. Unfortunately, not just for her but for a lot of people, how much you get of the former depends a lot on how much you get of the latter.
|
||||
|
||||
Incidentally, if you saw Jessica at a public event, you would never guess she hates attention, because (a) she is very polite and (b) when she's nervous, she expresses it by smiling more.
|
||||
|
||||
[3] The existence of people like Jessica is not just something the mainstream media needs to learn to acknowledge, but something feminists need to learn to acknowledge as well. There are successful women who don't like to fight. Which means if the public conversation about women consists of fighting, their voices will be silenced.
|
||||
|
||||
There's a sort of Gresham's Law of conversations. If a conversation reaches a certain level of incivility, the more thoughtful people start to leave. No one understands female founders better than Jessica. But it's unlikely anyone will ever hear her speak candidly about the topic. She ventured a toe in that water a while ago, and the reaction was so violent that she decided "never again."
|
||||
|
||||
**Thanks** to Sam Altman, Paul Buchheit, Patrick Collison, Daniel Gackle, Carolynn Levy, Jon Levy, Kirsty Nathoo, Robert Morris, Geoff Ralston, and Harj Taggar for reading drafts of this. And yes, Jessica Livingston, who made me cut surprisingly little.
|
||||
|
||||
* * *
|
||||
@@ -1,144 +0,0 @@
|
||||

|
||||

|
||||
|
||||
# Khoj
|
||||
|
||||
*Allow natural language search on user content like notes, images,
|
||||
transactions using transformer ML models*
|
||||
|
||||
User can interface with Khoj via [Web](./src/khoj/interface/web/index.html),
|
||||
[Emacs](./src/khoj/interface/emacs/khoj.el) or the API. All search is done
|
||||
locally[\*](https://github.com/debanjum/khoj#miscellaneous)
|
||||
|
||||
## Demo
|
||||
|
||||
<https://user-images.githubusercontent.com/6413477/168417719-8a8bc4e5-8404-42b2-89a7-4493e3d2582c.mp4>
|
||||
|
||||
## Setup
|
||||
|
||||
### 1. Clone
|
||||
|
||||
``` shell
|
||||
git clone https://github.com/debanjum/khoj && cd khoj
|
||||
```
|
||||
|
||||
### 2. Configure
|
||||
|
||||
- \[Required\] Update [docker-compose.yml](./docker-compose.yml) to
|
||||
mount your images, (org-mode or markdown) notes and beancount
|
||||
directories
|
||||
- \[Optional\] Edit application configuration in
|
||||
[khoj_sample.yml](./config/khoj_sample.yml)
|
||||
|
||||
### 3. Run
|
||||
|
||||
``` shell
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
*Note: The first run will take time. Let it run, it\'s most likely not hung,
|
||||
just generating embeddings*
|
||||
|
||||
## Use
|
||||
|
||||
- **Khoj via API**
|
||||
- See [Khoj API Docs](http://localhost:8000/docs)
|
||||
- [Query](http://localhost:8000/api/search?q=%22what%20is%20the%20meaning%20of%20life%22)
|
||||
- [Update Index](http://localhost:8000/api/update?t=ledger)
|
||||
- [Configure Application](http://localhost:8000/ui)
|
||||
- **Khoj via Emacs**
|
||||
- [Install](https://github.com/debanjum/khoj/tree/master/src/khoj/interface/emacs#installation)
|
||||
[khoj.el](./src/khoj/interface/emacs/khoj.el)
|
||||
- Run `M-x khoj <user-query>`
|
||||
|
||||
## Run Unit tests
|
||||
|
||||
``` shell
|
||||
pytest
|
||||
```
|
||||
|
||||
## Upgrade
|
||||
|
||||
``` shell
|
||||
docker-compose build --pull
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
- Symptom: Errors out with \"Killed\" in error message
|
||||
- Fix: Increase RAM available to Docker Containers in Docker
|
||||
Settings
|
||||
- Refer: [StackOverflow
|
||||
Solution](https://stackoverflow.com/a/50770267), [Configure
|
||||
Resources on Docker for
|
||||
Mac](https://docs.docker.com/desktop/mac/#resources)
|
||||
- Symptom: Errors out complaining about Tensors mismatch, null etc
|
||||
- Mitigation: Delete content-type \> image section from
|
||||
docker_sample_config.yml
|
||||
|
||||
## Miscellaneous
|
||||
|
||||
- The experimental [chat](http://localhost:8000/chat) API endpoint uses the
|
||||
[OpenAI API](https://openai.com/api/)
|
||||
- It is disabled by default
|
||||
- To use it add your `openai-api-key` to config.yml
|
||||
|
||||
## Development Setup
|
||||
|
||||
### Setup on Local Machine
|
||||
|
||||
1. Install Dependencies
|
||||
|
||||
1. Install Python3 \[Required\]
|
||||
|
||||
2. [Install
|
||||
Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html)
|
||||
\[Required\]
|
||||
|
||||
2. Install Khoj
|
||||
|
||||
``` shell
|
||||
git clone https://github.com/debanjum/khoj && cd khoj
|
||||
conda env create -f config/environment.yml
|
||||
conda activate khoj
|
||||
```
|
||||
|
||||
3. Configure
|
||||
|
||||
- Configure files/directories to search in `content-type` section
|
||||
of `khoj_sample.yml`
|
||||
- To run application on test data, update file paths containing
|
||||
`/data/` to `tests/data/` in `khoj_sample.yml`
|
||||
- Example replace `/data/org/*.org` with
|
||||
`tests/data/org/*.org`
|
||||
|
||||
4. Run
|
||||
|
||||
Load ML model, generate embeddings and expose API to query notes,
|
||||
images, transactions etc specified in config YAML
|
||||
|
||||
``` shell
|
||||
python3 -m src.khoj.main -c=config/khoj_sample.yml -vv
|
||||
```
|
||||
|
||||
### Upgrade On Local Machine
|
||||
|
||||
``` shell
|
||||
cd khoj
|
||||
git pull origin master
|
||||
conda deactivate khoj
|
||||
conda env update -f config/environment.yml
|
||||
conda activate khoj
|
||||
```
|
||||
|
||||
## Acknowledgments
|
||||
|
||||
- [Multi-QA MiniLM
|
||||
Model](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1)
|
||||
for Asymmetric Text Search. See [SBert
|
||||
Documentation](https://www.sbert.net/examples/applications/retrieve_rerank/README.html)
|
||||
- [OpenAI CLIP Model](https://github.com/openai/CLIP) for Image
|
||||
Search. See [SBert
|
||||
Documentation](https://www.sbert.net/examples/applications/image-search/README.html)
|
||||
- Charles Cave for [OrgNode
|
||||
Parser](http://members.optusnet.com.au/~charles57/GTD/orgnode.html)
|
||||
161
tests/data/markdown/undergraduation.md
Normal file
161
tests/data/markdown/undergraduation.md
Normal file
@@ -0,0 +1,161 @@
|
||||
# Undergraduation
|
||||
|
||||
[](https://sep.yimg.com/ca/I/paulgraham_2202_8895101)
|
||||
|
||||
**Want to start a startup?** Get funded by [Y Combinator](http://ycombinator.com/apply.html).
|
||||
|
||||
|
||||
March 2005
|
||||
|
||||
_(Parts of this essay began as replies to students who wrote to me with questions.)_
|
||||
|
||||
Recently I've had several emails from computer science undergrads asking what to do in college. I might not be the best source of advice, because I was a philosophy major in college. But I took so many CS classes that most CS majors thought I was one. I was certainly a hacker, at least.
|
||||
|
||||
**Hacking**
|
||||
|
||||
What should you do in college to become a [good hacker](gh.html)? There are two main things you can do: become very good at programming, and learn a lot about specific, cool problems. These turn out to be equivalent, because each drives you to do the other.
|
||||
|
||||
The way to be good at programming is to work (a) a lot (b) on hard problems. And the way to make yourself work on hard problems is to work on some very engaging project.
|
||||
|
||||
Odds are this project won't be a class assignment. My friend Robert learned a lot by writing network software when he was an undergrad. One of his projects was to connect Harvard to the Arpanet; it had been one of the original nodes, but by 1984 the connection had died. [1] Not only was this work not for a class, but because he spent all his time on it and neglected his studies, he was kicked out of school for a year. [2] It all evened out in the end, and now he's a professor at MIT. But you'll probably be happier if you don't go to that extreme; it caused him a lot of worry at the time.
|
||||
|
||||
Another way to be good at programming is to find other people who are good at it, and learn what they know. Programmers tend to sort themselves into tribes according to the type of work they do and the tools they use, and some tribes are [smarter](pypar.html) than others. Look around you and see what the smart people seem to be working on; there's usually a reason.
|
||||
|
||||
Some of the smartest people around you are professors. So one way to find interesting work is to volunteer as a research assistant. Professors are especially interested in people who can solve tedious system-administration type problems for them, so that is a way to get a foot in the door. What they fear are flakes and resume padders. It's all too common for an assistant to result in a net increase in work. So you have to make it clear you'll mean a net decrease.
|
||||
|
||||
Don't be put off if they say no. Rejection is almost always less personal than the rejectee imagines. Just move on to the next. (This applies to dating too.)
|
||||
|
||||
Beware, because although most professors are smart, not all of them work on interesting stuff. Professors have to publish novel results to advance their careers, but there is more competition in more interesting areas of research. So what less ambitious professors do is turn out a series of papers whose conclusions are novel because no one else cares about them. You're better off avoiding these.
|
||||
|
||||
I never worked as a research assistant, so I feel a bit dishonest recommending that route. I learned to program by writing stuff of my own, particularly by trying to reverse-engineer Winograd's SHRDLU. I was as obsessed with that program as a mother with a new baby.
|
||||
|
||||
Whatever the disadvantages of working by yourself, the advantage is that the project is all your own. You never have to compromise or ask anyone's permission, and if you have a new idea you can just sit down and start implementing it.
|
||||
|
||||
In your own projects you don't have to worry about novelty (as professors do) or profitability (as businesses do). All that matters is how hard the project is technically, and that has no correlation to the nature of the application. "Serious" applications like databases are often trivial and dull technically (if you ever suffer from insomnia, try reading the technical literature about databases) while "frivolous" applications like games are often very sophisticated. I'm sure there are game companies out there working on products with more intellectual content than the research at the bottom nine tenths of university CS departments.
|
||||
|
||||
If I were in college now I'd probably work on graphics: a network game, for example, or a tool for 3D animation. When I was an undergrad there weren't enough cycles around to make graphics interesting, but it's hard to imagine anything more fun to work on now.
|
||||
|
||||
**Math**
|
||||
|
||||
When I was in college, a lot of the professors believed (or at least wished) that [computer science](hp.html) was a branch of math. This idea was strongest at Harvard, where there wasn't even a CS major till the 1980s; till then one had to major in applied math. But it was nearly as bad at Cornell. When I told the fearsome Professor Conway that I was interested in AI (a hot topic then), he told me I should major in math. I'm still not sure whether he thought AI required math, or whether he thought AI was nonsense and that majoring in something rigorous would cure me of such stupid ambitions.
|
||||
|
||||
In fact, the amount of math you need as a hacker is a lot less than most university departments like to admit. I don't think you need much more than high school math plus a few concepts from the theory of computation. (You have to know what an n^2 algorithm is if you want to avoid writing them.) Unless you're planning to write math applications, of course. Robotics, for example, is all math.
|
||||
|
||||
But while you don't literally need math for most kinds of hacking, in the sense of knowing 1001 tricks for differentiating formulas, math is very much worth studying for its own sake. It's a valuable source of metaphors for almost any kind of work. [3] I wish I'd studied more math in college for that reason.
|
||||
|
||||
Like a lot of people, I was mathematically abused as a child. I learned to think of math as a collection of formulas that were neither beautiful nor had any relation to my life (despite attempts to translate them into "word problems"), but had to be memorized in order to do well on tests.
|
||||
|
||||
One of the most valuable things you could do in college would be to learn what math is really about. This may not be easy, because a lot of good mathematicians are bad teachers. And while there are many popular books on math, few seem good. The best I can think of are W. W. Sawyer's. And of course Euclid. [4]
|
||||
|
||||
**Everything**
|
||||
|
||||
Thomas Huxley said "Try to learn something about everything and everything about something." Most universities aim at this ideal.
|
||||
|
||||
But what's everything? To me it means, all that people learn in the course of working honestly on hard problems. All such work tends to be related, in that ideas and techniques from one field can often be transplanted successfully to others. Even others that seem quite distant. For example, I write [essays](essay.html) the same way I write software: I sit down and blow out a lame version 1 as fast as I can type, then spend several weeks rewriting it.
|
||||
|
||||
Working on hard problems is not, by itself, enough. Medieval alchemists were working on a hard problem, but their approach was so bogus that there was little to learn from studying it, except possibly about people's ability to delude themselves. Unfortunately the sort of AI I was trying to learn in college had the same flaw: a very hard problem, blithely approached with hopelessly inadequate techniques. Bold? Closer to fraudulent.
|
||||
|
||||
The social sciences are also fairly bogus, because they're so much influenced by intellectual [fashions](say.html). If a physicist met a colleague from 100 years ago, he could teach him some new things; if a psychologist met a colleague from 100 years ago, they'd just get into an ideological argument. Yes, of course, you'll learn something by taking a psychology class. The point is, you'll learn more by taking a class in another department.
|
||||
|
||||
The worthwhile departments, in my opinion, are math, the hard sciences, engineering, history (especially economic and social history, and the history of science), architecture, and the classics. A survey course in art history may be worthwhile. Modern literature is important, but the way to learn about it is just to read. I don't know enough about music to say.
|
||||
|
||||
You can skip the social sciences, philosophy, and the various departments created recently in response to political pressures. Many of these fields talk about important problems, certainly. But the way they talk about them is useless. For example, philosophy talks, among other things, about our obligations to one another; but you can learn more about this from a wise grandmother or E. B. White than from an academic philosopher.
|
||||
|
||||
I speak here from experience. I should probably have been offended when people laughed at Clinton for saying "It depends on what the meaning of the word 'is' is." I took about five classes in college on what the meaning of "is" is.
|
||||
|
||||
Another way to figure out which fields are worth studying is to create the _dropout graph._ For example, I know many people who switched from math to computer science because they found math too hard, and no one who did the opposite. People don't do hard things gratuitously; no one will work on a harder problem unless it is proportionately (or at least log(n)) more rewarding. So probably math is more worth studying than computer science. By similar comparisons you can make a graph of all the departments in a university. At the bottom you'll find the subjects with least intellectual content.
|
||||
|
||||
If you use this method, you'll get roughly the same answer I just gave.
|
||||
|
||||
Language courses are an anomaly. I think they're better considered as extracurricular activities, like pottery classes. They'd be far more useful when combined with some time living in a country where the language is spoken. On a whim I studied Arabic as a freshman. It was a lot of work, and the only lasting benefits were a weird ability to identify semitic roots and some insights into how people recognize words.
|
||||
|
||||
Studio art and creative writing courses are wildcards. Usually you don't get taught much: you just work (or don't work) on whatever you want, and then sit around offering "crits" of one another's creations under the vague supervision of the teacher. But writing and art are both very hard problems that (some) people work honestly at, so they're worth doing, especially if you can find a good teacher.
|
||||
|
||||
**Jobs**
|
||||
|
||||
Of course college students have to think about more than just learning. There are also two practical problems to consider: jobs, and graduate school.
|
||||
|
||||
In theory a liberal education is not supposed to supply job training. But everyone knows this is a bit of a fib. Hackers at every college learn practical skills, and not by accident.
|
||||
|
||||
What you should learn to get a job depends on the kind you want. If you want to work in a big company, learn how to hack [Blub](avg.html) on Windows. If you want to work at a cool little company or research lab, you'll do better to learn Ruby on Linux. And if you want to start your own company, which I think will be more and more common, master the most powerful tools you can find, because you're going to be in a race against your competitors, and they'll be your horse.
|
||||
|
||||
There is not a direct correlation between the skills you should learn in college and those you'll use in a job. You should aim slightly high in college.
|
||||
|
||||
In workouts a football player may bench press 300 pounds, even though he may never have to exert anything like that much force in the course of a game. Likewise, if your professors try to make you learn stuff that's more advanced than you'll need in a job, it may not just be because they're academics, detached from the real world. They may be trying to make you lift weights with your brain.
|
||||
|
||||
The programs you write in classes differ in three critical ways from the ones you'll write in the real world: they're small; you get to start from scratch; and the problem is usually artificial and predetermined. In the real world, programs are bigger, tend to involve existing code, and often require you to figure out what the problem is before you can solve it.
|
||||
|
||||
You don't have to wait to leave (or even enter) college to learn these skills. If you want to learn how to deal with existing code, for example, you can contribute to open-source projects. The sort of employer you want to work for will be as impressed by that as good grades on class assignments.
|
||||
|
||||
In existing open-source projects you don't get much practice at the third skill, deciding what problems to solve. But there's nothing to stop you starting new projects of your own. And good employers will be even more impressed with that.
|
||||
|
||||
What sort of problem should you try to solve? One way to answer that is to ask what you need as a user. For example, I stumbled on a good algorithm for spam filtering because I wanted to stop getting spam. Now what I wish I had was a mail reader that somehow prevented my inbox from filling up. I tend to use my inbox as a todo list. But that's like using a screwdriver to open bottles; what one really wants is a bottle opener.
|
||||
|
||||
**Grad School**
|
||||
|
||||
What about grad school? Should you go? And how do you get into a good one?
|
||||
|
||||
In principle, grad school is professional training in research, and you shouldn't go unless you want to do research as a career. And yet half the people who get PhDs in CS don't go into research. I didn't go to grad school to become a professor. I went because I wanted to learn more.
|
||||
|
||||
So if you're mainly interested in hacking and you go to grad school, you'll find a lot of other people who are similarly out of their element. And if half the people around you are out of their element in the same way you are, are you really out of your element?
|
||||
|
||||
There's a fundamental problem in "computer science," and it surfaces in situations like this. No one is sure what "research" is supposed to be. A lot of research is hacking that had to be crammed into the form of an academic paper to yield one more quantum of publication.
|
||||
|
||||
So it's kind of misleading to ask whether you'll be at home in grad school, because very few people are quite at home in computer science. The whole field is uncomfortable in its own skin. So the fact that you're mainly interested in hacking shouldn't deter you from going to grad school. Just be warned you'll have to do a lot of stuff you don't like.
|
||||
|
||||
Number one will be your dissertation. Almost everyone hates their dissertation by the time they're done with it. The process inherently tends to produce an unpleasant result, like a cake made out of whole wheat flour and baked for twelve hours. Few dissertations are read with pleasure, especially by their authors.
|
||||
|
||||
But thousands before you have suffered through writing a dissertation. And aside from that, grad school is close to paradise. Many people remember it as the happiest time of their lives. And nearly all the rest, including me, remember it as a period that would have been, if they hadn't had to write a dissertation. [5]
|
||||
|
||||
The danger with grad school is that you don't see the scary part upfront. PhD programs start out as college part 2, with several years of classes. So by the time you face the horror of writing a dissertation, you're already several years in. If you quit now, you'll be a grad-school dropout, and you probably won't like that idea. When Robert got kicked out of grad school for writing the Internet worm of 1988, I envied him enormously for finding a way out without the stigma of failure.
|
||||
|
||||
On the whole, grad school is probably better than most alternatives. You meet a lot of smart people, and your glum procrastination will at least be a powerful common bond. And of course you have a PhD at the end. I forgot about that. I suppose that's worth something.
|
||||
|
||||
The greatest advantage of a PhD (besides being the union card of academia, of course) may be that it gives you some baseline confidence. For example, the Honeywell thermostats in my house have the most atrocious UI. My mother, who has the same model, diligently spent a day reading the user's manual to learn how to operate hers. She assumed the problem was with her. But I can think to myself "If someone with a PhD in computer science can't understand this thermostat, it _must_ be badly designed."
|
||||
|
||||
If you still want to go to grad school after this equivocal recommendation, I can give you solid advice about how to get in. A lot of my friends are CS professors now, so I have the inside story about admissions. It's quite different from college. At most colleges, admissions officers decide who gets in. For PhD programs, the professors do. And they try to do it well, because the people they admit are going to be working for them.
|
||||
|
||||
Apparently only recommendations really matter at the best schools. Standardized tests count for nothing, and grades for little. The essay is mostly an opportunity to disqualify yourself by saying something stupid. The only thing professors trust is recommendations, preferably from people they know. [6]
|
||||
|
||||
So if you want to get into a PhD program, the key is to impress your professors. And from my friends who are professors I know what impresses them: not merely trying to impress them. They're not impressed by students who get good grades or want to be their research assistants so they can get into grad school. They're impressed by students who get good grades and want to be their research assistants because they're genuinely interested in the topic.
|
||||
|
||||
So the best thing you can do in college, whether you want to get into grad school or just be good at hacking, is figure out what you truly like. It's hard to trick professors into letting you into grad school, and impossible to trick problems into letting you solve them. College is where faking stops working. From this point, unless you want to go work for a big company, which is like reverting to high school, the only way forward is through doing what you [love](love.html).
|
||||
|
||||
|
||||
---
|
||||
**Notes**
|
||||
|
||||
[1] No one seems to have minded, which shows how unimportant the Arpanet (which became the Internet) was as late as 1984.
|
||||
|
||||
[2] This is why, when I became an employer, I didn't care about GPAs. In fact, we actively sought out people who'd failed out of school. We once put up posters around Harvard saying "Did you just get kicked out for doing badly in your classes because you spent all your time working on some project of your own? Come work for us!" We managed to find a kid who had been, and he was a great hacker.
|
||||
|
||||
When Harvard kicks undergrads out for a year, they have to get jobs. The idea is to show them how awful the real world is, so they'll understand how lucky they are to be in college. This plan backfired with the guy who came to work for us, because he had more fun than he'd had in school, and made more that year from stock options than any of his professors did in salary. So instead of crawling back repentant at the end of the year, he took another year off and went to Europe. He did eventually graduate at about 26.
|
||||
|
||||
[3] Eric Raymond says the best metaphors for hackers are in set theory, combinatorics, and graph theory.
|
||||
|
||||
Trevor Blackwell reminds you to take math classes intended for math majors. "'Math for engineers' classes sucked mightily. In fact any 'x for engineers' sucks, where x includes math, law, writing and visual design."
|
||||
|
||||
[4] Other highly recommended books: _What is Mathematics?_, by Courant and Robbins; _Geometry and the Imagination_ by Hilbert and Cohn-Vossen. And for those interested in graphic design, [Byrne's Euclid](http://www.math.ubc.ca/people/faculty/cass/Euclid/byrne.html).
|
||||
|
||||
[5] If you wanted to have the perfect life, the thing to do would be to go to grad school, secretly write your dissertation in the first year or two, and then just enjoy yourself for the next three years, dribbling out a chapter at a time. This prospect will make grad students' mouths water, but I know of no one who's had the discipline to pull it off.
|
||||
|
||||
[6] One professor friend says that 15-20% of the grad students they admit each year are "long shots." But what he means by long shots are people whose applications are perfect in every way, except that no one on the admissions committee knows the professors who wrote the recommendations.
|
||||
|
||||
So if you want to get into grad school in the sciences, you need to go to college somewhere with real research professors. Otherwise you'll seem a risky bet to admissions committees, no matter how good you are.
|
||||
|
||||
Which implies a surprising but apparently inevitable consequence: little liberal arts colleges are doomed. Most smart high school kids at least consider going into the sciences, even if they ultimately choose not to. Why go to a college that limits their options?
|
||||
|
||||
|
||||
---
|
||||
|
||||
**Thanks** to Trevor Blackwell, Alex Lewin, Jessica Livingston, Robert Morris, Eric Raymond, and several [anonymous CS professors](undergrad2.html) for reading drafts of this, and to the students whose questions began it.
|
||||
|
||||
---
|
||||
|
||||
[More Advice for Undergrads](undergrad2.html)
|
||||
|
||||
[Joel Spolsky: Advice for Computer Science College Students](http://www.joelonsoftware.com/articles/CollegeAdvice.html)
|
||||
|
||||
[Eric Raymond: How to Become a Hacker](http://www.catb.org/~esr/faqs/hacker-howto.html)
|
||||
|
||||
* * *
|
||||
131
tests/data/markdown/what_i_did_this_summer.md
Normal file
131
tests/data/markdown/what_i_did_this_summer.md
Normal file
@@ -0,0 +1,131 @@
|
||||
# 050 What I Did this Summer
|
||||
|
||||
[](https://sep.yimg.com/ca/I/paulgraham_2202_8808526)
|
||||
|
||||
|
||||
October 2005
|
||||
|
||||
The first Summer Founders Program has just finished. We were surprised how well it went. Overall only about 10% of startups succeed, but if I had to guess now, I'd predict three or four of the eight startups we funded will make it.
|
||||
|
||||
Of the startups that needed further funding, I believe all have either closed a round or are likely to soon. Two have already turned down (lowball) acquisition offers.
|
||||
|
||||
We would have been happy if just one of the eight seemed promising by the end of the summer. What's going on? Did some kind of anomaly make this summer's applicants especially good? We worry about that, but we can't think of one. We'll find out this winter.
|
||||
|
||||
The whole summer was full of surprises. The best was that the [hypothesis](hiring.html) we were testing seems to be correct. Young hackers can start viable companies. This is good news for two reasons: (a) it's an encouraging thought, and (b) it means that Y Combinator, which is predicated on the idea, is not hosed.
|
||||
|
||||
**Age**
|
||||
|
||||
More precisely, the hypothesis was that success in a startup depends mainly on how smart and energetic you are, and much less on how old you are or how much business experience you have. The results so far bear this out. The 2005 summer founders ranged in age from 18 to 28 (average 23), and there is no correlation between their ages and how well they're doing.
|
||||
|
||||
This should not really be surprising. Bill Gates and Michael Dell were both 19 when they started the companies that made them famous. Young founders are not a new phenomenon: the trend began as soon as computers got cheap enough for college kids to afford them.
|
||||
|
||||
Another of our hypotheses was that you can start a startup on less money than most people think. Other investors were surprised to hear the most we gave any group was $20,000. But we knew it was possible to start on that little because we started Viaweb on $10,000.
|
||||
|
||||
And so it proved this summer. Three months' funding is enough to get into second gear. We had a demo day for potential investors ten weeks in, and seven of the eight groups had a prototype ready by that time. One, [Reddit](http://reddit.com), had already launched, and were able to give a demo of their live site.
|
||||
|
||||
A researcher who studied the SFP startups said the one thing they had in common was that they all worked ridiculously hard. People this age are commonly seen as lazy. I think in some cases it's not so much that they lack the appetite for work, but that the work they're offered is unappetizing.
|
||||
|
||||
The experience of the SFP suggests that if you let motivated people do real work, they work hard, whatever their age. As one of the founders said "I'd read that starting a startup consumed your life, but I had no idea what that meant until I did it."
|
||||
|
||||
I'd feel guilty if I were a boss making people work this hard. But we're not these people's bosses. They're working on their own projects. And what makes them work is not us but their competitors. Like good athletes, they don't work hard because the coach yells at them, but because they want to win.
|
||||
|
||||
We have less power than bosses, and yet the founders work harder than employees. It seems like a win for everyone. The only catch is that we get on average only about 5-7% of the upside, while an employer gets nearly all of it. (We're counting on it being 5-7% of a much larger number.)
|
||||
|
||||
As well as working hard, the groups all turned out to be extraordinarily responsible. I can't think of a time when one failed to do something they'd promised to, even by being late for an appointment. This is another lesson the world has yet to learn. One of the founders discovered that the hardest part of arranging a meeting with executives at a big cell phone carrier was getting a rental company to rent him a car, because he was too young.
|
||||
|
||||
I think the problem here is much the same as with the apparent laziness of people this age. They seem lazy because the work they're given is pointless, and they act irresponsible because they're not given any power. Some of them, anyway. We only have a sample size of about twenty, but it seems so far that if you let people in their early twenties be their own bosses, they rise to the occasion.
|
||||
|
||||
**Morale**
|
||||
|
||||
The summer founders were as a rule very idealistic. They also wanted very much to get rich. These qualities might seem incompatible, but they're not. These guys want to get rich, but they want to do it by changing the world. They wouldn't (well, seven of the eight groups wouldn't) be interested in making money by speculating in stocks. They want to make something people use.
|
||||
|
||||
I think this makes them more effective as founders. As hard as people will work for money, they'll work harder for a cause. And since success in a startup depends so much on motivation, the paradoxical result is that the people likely to make the most money are those who aren't in it just for the money.
|
||||
|
||||
The founders of [Kiko](http://kiko.com), for example, are working on an Ajax calendar. They want to get rich, but they pay more attention to design than they would if that were their only motivation. You can tell just by looking at
|
||||
it.
|
||||
|
||||
|
||||
I never considered it till this summer, but this might be another reason startups run by hackers tend to do better than those run by MBAs. Perhaps it's not just that hackers understand technology better, but that they're driven by more powerful motivations. Microsoft, as I've said before, is a dangerously misleading example. Their mean corporate culture only works for monopolies. Google is a better model.
|
||||
|
||||
Considering that the summer founders are the sharks in this ocean, we were surprised how frightened most of them were of competitors. But now that I think of it, we were just as frightened when we started Viaweb. For the first year, our initial reaction to news of a competitor was always: we're doomed. Just as a hypochondriac magnifies his symptoms till he's convinced he has some terrible disease, when you're not used to competitors you magnify them into monsters.
|
||||
|
||||
Here's a handy rule for startups: competitors are rarely as dangerous as they seem. Most will self-destruct before you can destroy them. And it certainly doesn't matter how many of them there are, any more than it matters to the winner of a marathon how many runners are behind him.
|
||||
|
||||
"It's a crowded market," I remember one founder saying worriedly.
|
||||
|
||||
"Are you the current leader?" I asked.
|
||||
|
||||
"Yes."
|
||||
|
||||
"Is anyone able to develop software faster than you?"
|
||||
|
||||
"Probably not."
|
||||
|
||||
"Well, if you're ahead now, and you're the fastest, then you'll stay ahead. What difference does it make how many others there are?"
|
||||
|
||||
Another group was worried when they realized they had to rewrite their software from scratch. I told them it would be a bad sign if they didn't. The main function of your initial version is to be rewritten.
|
||||
|
||||
That's why we advise groups to ignore issues like scalability, internationalization, and heavy-duty security at first. [1] I can imagine an advocate of "best practices" saying these ought to be considered from the start. And he'd be right, except that they interfere with the primary function of software in a startup: to be a vehicle for experimenting with its own design. Having to retrofit internationalization or scalability is a pain, certainly. The only bigger pain is not needing to, because your initial version was too big and rigid to evolve into something users wanted.
|
||||
|
||||
I suspect this is another reason startups beat big companies. Startups can be irresponsible and release version 1s that are light enough to evolve. In big companies, all the pressure is in the direction of over-engineering.
|
||||
|
||||
**What Got Learned**
|
||||
|
||||
One thing we were curious about this summer was where these groups would need help. That turned out to vary a lot. Some we helped with technical advice-- for example, about how to set up an application to run on multiple servers. Most we helped with strategy questions, like what to patent, and what to charge for and what to give away. Nearly all wanted advice about dealing with future investors: how much money should they take and what kind of terms should they expect?
|
||||
|
||||
However, all the groups quickly learned how to deal with stuff like patents and investors. These problems aren't intrinsically difficult, just unfamiliar.
|
||||
|
||||
It was surprising-- slightly frightening even-- how fast they learned. The weekend before the demo day for investors, we had a practice session where all the groups gave their presentations. They were all terrible. We tried to explain how to make them better, but we didn't have much hope. So on demo day I told the assembled angels and VCs that these guys were hackers, not MBAs, and so while their software was good, we should not expect slick presentations from them.
|
||||
|
||||
The groups then proceeded to give fabulously slick presentations. Gone were the mumbling recitations of lists of features. It was as if they'd spent the past week at acting school. I still don't know how they did it.
|
||||
|
||||
Perhaps watching each other's presentations helped them see what they'd been doing wrong. Just as happens in college, the summer founders learned a lot from one another-- maybe more than they learned from us. A lot of the problems they face are the same, from dealing with investors to hacking JavaScript.
|
||||
|
||||
I don't want to give the impression there were no problems this summer. A lot went wrong, as usually happens with startups. One group got an "[exploding term-sheet](http://www.ventureblog.com/articles/indiv/2003/000024.html)" from some VCs. Pretty much all the groups who had dealings with big companies found that big companies do everything infinitely slowly. (This is to be expected. If big companies weren't incapable, there would be no room for startups to exist.) And of course there were the usual nightmares associated with servers.
|
||||
|
||||
In short, the disasters this summer were just the usual childhood diseases. Some of this summer's eight startups will probably die eventually; it would be extraordinary if all eight succeeded. But what kills them will not be dramatic, external threats, but a mundane, internal one: not getting enough done.
|
||||
|
||||
So far, though, the news is all good. In fact, we were surprised how much fun the summer was for us. The main reason was how much we liked the founders. They're so earnest and hard-working. They seem to like us too. And this illustrates another advantage of investing over hiring: our relationship with them is way better than it would be between a boss and an employee. Y Combinator ends up being more like an older brother than a parent.
|
||||
|
||||
I was surprised how much time I spent making introductions. Fortunately I discovered that when a startup needed to talk to someone, I could usually get to the right person by at most one hop. I remember wondering, how did my friends get to be so eminent? and a second later realizing: shit, I'm forty.
|
||||
|
||||
Another surprise was that the three-month batch format, which we were forced into by the constraints of the summer, turned out to be an advantage. When we started Y Combinator, we planned to invest the way other venture firms do: as proposals came in, we'd evaluate them and decide yes or no. The SFP was just an experiment to get things started. But it worked so well that we plan to do [all](http://ycombinator.com/funding.html) our investing this way, one cycle in the summer and one in winter. It's more efficient for us, and better for the startups too.
|
||||
|
||||
Several groups said our weekly dinners saved them from a common problem afflicting startups: working so hard that one has no social life. (I remember that part all too well.) This way, they were guaranteed a social event at least once a week.
|
||||
|
||||
**Independence**
|
||||
|
||||
I've heard Y Combinator described as an "incubator." Actually we're the opposite: incubators exert more control than ordinary VCs, and we make a point of exerting less. Among other things, incubators usually make you work in their office-- that's where the word "incubator" comes from. That seems the wrong model. If investors get too involved, they smother one of the most powerful forces in a startup: the feeling that it's your own company.
|
||||
|
||||
Incubators were conspicuous failures during the Bubble. There's still debate about whether this was because of the Bubble, or because they're a bad idea. My vote is they're a bad idea. I think they fail because they select for the wrong people. When we were starting a startup, we would never have taken funding from an "incubator." We can find office space, thanks; just give us the money. And people with that attitude are the ones likely to succeed in startups.
|
||||
|
||||
Indeed, one quality all the founders shared this summer was a spirit of independence. I've been wondering about that. Are some people just a lot more independent than others, or would everyone be this way if they were allowed to?
|
||||
|
||||
|
||||
As with most nature/nurture questions, the answer is probably: some of each. But my main conclusion from the summer is that there's more environment in the mix than most people realize. I could see that from how the founders' attitudes _changed_ during the summer. Most were emerging from twenty or so years of being told what to do. They seemed a little surprised at having total freedom. But they grew into it really quickly; some of these guys now seem about four inches taller (metaphorically) than they did at the beginning of the summer.
|
||||
|
||||
When we asked the summer founders what surprised them most about starting a company, one said "the most shocking thing is that it worked."
|
||||
|
||||
It will take more experience to know for sure, but my guess is that a lot of hackers could do this-- that if you put people in a position of independence, they develop the qualities they need. Throw them off a cliff, and most will find on the way down that they have wings.
|
||||
|
||||
The reason this is news to anyone is that the same forces work in the other direction too. Most hackers are employees, and this [molds](http://software.ericsink.com/entries/No_Great_Hackers.html) you into someone to whom starting a startup seems impossible as surely as starting a startup molds you into someone who can handle it.
|
||||
|
||||
If I'm right, "hacker" will mean something different in twenty years than it does now. Increasingly it will mean the people who run the company. Y Combinator is just accelerating a process that would have happened anyway. Power is shifting from the people who deal with money to the people who create technology, and if our experience this summer is any guide, this will be a good thing.
|
||||
|
||||
|
||||
---
|
||||
**Notes**
|
||||
|
||||
[1] By heavy-duty security I mean efforts to protect against truly determined attackers.
|
||||
|
||||
The [image](https://sep.yimg.com/ty/cdn/paulgraham/sfptable.jpg?t=1595850613&) shows us, the 2005 summer founders, and Smartleaf co-founders Mark Nitzberg and Olin Shivers at the 30-foot table Kate Courteau designed for us. Photo by Alex Lewin.
|
||||
|
||||
**Thanks** to Sarah Harlin, Steve Huffman, Jessica Livingston, Zak Stone, and Aaron Swartz for reading drafts of this.
|
||||
|
||||
----
|
||||
[Romanian Translation](http://ro.goobix.com/pg/sfp/)
|
||||
|
||||
[Japanese Translation](http://d.hatena.ne.jp/lionfan/20060112)
|
||||
|
||||
* * *
|
||||
351
tests/data/markdown/what_i_worked_on.md
Normal file
351
tests/data/markdown/what_i_worked_on.md
Normal file
@@ -0,0 +1,351 @@
|
||||
# 198 What I Worked On
|
||||
|
||||
|
||||
February 2021
|
||||
|
||||
Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.
|
||||
|
||||
The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.
|
||||
|
||||
The language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the card reader and press a button to load the program into memory and run it. The result would ordinarily be to print something on the spectacularly loud printer.
|
||||
|
||||
I was puzzled by the 1401. I couldn't figure out what to do with it. And in retrospect there's not much I could have done with it. The only form of input to programs was data stored on punched cards, and I didn't have any data stored on punched cards. The only other option was to do things that didn't rely on any input, like calculate approximations of pi, but I didn't know enough math to do anything interesting of that type. So I'm not surprised I can't remember any programs I wrote, because they can't have done much. My clearest memory is of the moment I learned it was possible for programs not to terminate, when one of mine didn't. On a machine without time-sharing, this was a social as well as a technical error, as the data center manager's expression made clear.
|
||||
|
||||
With microcomputers, everything changed. Now you could have a computer sitting right in front of you, on a desk, that could respond to your keystrokes as it was running instead of just churning through a stack of punch cards and then stopping. [1]
|
||||
|
||||
The first of my friends to get a microcomputer built it himself. It was sold as a kit by Heathkit. I remember vividly how impressed and envious I felt watching him sitting in front of it, typing programs right into the computer.
|
||||
|
||||
Computers were expensive in those days and it took me years of nagging before I convinced my father to buy one, a TRS-80, in about 1980. The gold standard then was the Apple II, but a TRS-80 was good enough. This was when I really started programming. I wrote simple games, a program to predict how high my model rockets would fly, and a word processor that my father used to write at least one book. There was only room in memory for about 2 pages of text, so he'd write 2 pages at a time and then print them out, but it was a lot better than a typewriter.
|
||||
|
||||
Though I liked programming, I didn't plan to study it in college. In college I was going to study philosophy, which sounded much more powerful. It seemed, to my naive high school self, to be the study of the ultimate truths, compared to which the things studied in other fields would be mere domain knowledge. What I discovered when I got to college was that the other fields took up so much of the space of ideas that there wasn't much left for these supposed ultimate truths. All that seemed left for philosophy were edge cases that people in other fields felt could safely be ignored.
|
||||
|
||||
I couldn't have put this into words when I was 18. All I knew at the time was that I kept taking philosophy courses and they kept being boring. So I decided to switch to AI.
|
||||
|
||||
AI was in the air in the mid 1980s, but there were two things especially that made me want to work on it: a novel by Heinlein called _The Moon is a Harsh Mistress_ , which featured an intelligent computer called Mike, and a PBS documentary that showed Terry Winograd using SHRDLU. I haven't tried rereading _The Moon is a Harsh Mistress_ , so I don't know how well it has aged, but when I read it I was drawn entirely into its world. It seemed only a matter of time before we'd have Mike, and when I saw Winograd using SHRDLU, it seemed like that time would be a few years at most. All you had to do was teach SHRDLU more words.
|
||||
|
||||
There weren't any classes in AI at Cornell then, not even graduate classes, so I started trying to teach myself. Which meant learning Lisp, since in those days Lisp was regarded as the language of AI. The commonly used programming languages then were pretty primitive, and programmers' ideas correspondingly so. The default language at Cornell was a Pascal-like language called PL/I, and the situation was similar elsewhere. Learning Lisp expanded my concept of a program so fast that it was years before I started to have a sense of where the new limits were. This was more like it; this was what I had expected college to do. It wasn't happening in a class, like it was supposed to, but that was ok. For the next couple years I was on a roll. I knew what I was going to do.
|
||||
|
||||
For my undergraduate thesis, I reverse-engineered SHRDLU. My God did I love working on that program. It was a pleasing bit of code, but what made it even more exciting was my belief — hard to imagine now, but not unique in 1985 — that it was already climbing the lower slopes of intelligence.
|
||||
|
||||
I had gotten into a program at Cornell that didn't make you choose a major. You could take whatever classes you liked, and choose whatever you liked to put on your degree. I of course chose "Artificial Intelligence." When I got the actual physical diploma, I was dismayed to find that the quotes had been included, which made them read as scare-quotes. At the time this bothered me, but now it seems amusingly accurate, for reasons I was about to discover.
|
||||
|
||||
I applied to 3 grad schools: MIT and Yale, which were renowned for AI at the time, and Harvard, which I'd visited because Rich Draves went there, and was also home to Bill Woods, who'd invented the type of parser I used in my SHRDLU clone. Only Harvard accepted me, so that was where I went.
|
||||
|
||||
I don't remember the moment it happened, or if there even was a specific moment, but during the first year of grad school I realized that AI, as practiced at the time, was a hoax. By which I mean the sort of AI in which a program that's told "the dog is sitting on the chair" translates this into some formal representation and adds it to the list of things it knows.
|
||||
|
||||
What these programs really showed was that there's a subset of natural language that's a formal language. But a very proper subset. It was clear that there was an unbridgeable gap between what they could do and actually understanding natural language. It was not, in fact, simply a matter of teaching SHRDLU more words. That whole way of doing AI, with explicit data structures representing concepts, was not going to work. Its brokenness did, as so often happens, generate a lot of opportunities to write papers about various band-aids that could be applied to it, but it was never going to get us Mike.
|
||||
|
||||
So I looked around to see what I could salvage from the wreckage of my plans, and there was Lisp. I knew from experience that Lisp was interesting for its own sake and not just for its association with AI, even though that was the main reason people cared about it at the time. So I decided to focus on Lisp. In fact, I decided to write a book about Lisp hacking. It's scary to think how little I knew about Lisp hacking when I started writing that book. But there's nothing like writing a book about something to help you learn it. The book, _On Lisp_ , wasn't published till 1993, but I wrote much of it in grad school.
|
||||
|
||||
Computer Science is an uneasy alliance between two halves, theory and systems. The theory people prove things, and the systems people build things. I wanted to build things. I had plenty of respect for theory — indeed, a sneaking suspicion that it was the more admirable of the two halves — but building things seemed so much more exciting.
|
||||
|
||||
The problem with systems work, though, was that it didn't last. Any program you wrote today, no matter how good, would be obsolete in a couple decades at best. People might mention your software in footnotes, but no one would actually use it. And indeed, it would seem very feeble work. Only people with a sense of the history of the field would even realize that, in its time, it had been good.
|
||||
|
||||
There were some surplus Xerox Dandelions floating around the computer lab at one point. Anyone who wanted one to play around with could have one. I was briefly tempted, but they were so slow by present standards; what was the point? No one else wanted one either, so off they went. That was what happened to systems work.
|
||||
|
||||
I wanted not just to build things, but to build things that would last.
|
||||
|
||||
In this dissatisfied state I went in 1988 to visit Rich Draves at CMU, where he was in grad school. One day I went to visit the Carnegie Institute, where I'd spent a lot of time as a kid. While looking at a painting there I realized something that might seem obvious, but was a big surprise to me. There, right on the wall, was something you could make that would last. Paintings didn't become obsolete. Some of the best ones were hundreds of years old.
|
||||
|
||||
And moreover this was something you could make a living doing. Not as easily as you could by writing software, of course, but I thought if you were really industrious and lived really cheaply, it had to be possible to make enough to survive. And as an artist you could be truly independent. You wouldn't have a boss, or even need to get research funding.
|
||||
|
||||
I had always liked looking at paintings. Could I make them? I had no idea. I'd never imagined it was even possible. I knew intellectually that people made art — that it didn't just appear spontaneously — but it was as if the people who made it were a different species. They either lived long ago or were mysterious geniuses doing strange things in profiles in _Life_ magazine. The idea of actually being able to make art, to put that verb before that noun, seemed almost miraculous.
|
||||
|
||||
That fall I started taking art classes at Harvard. Grad students could take classes in any department, and my advisor, Tom Cheatham, was very easy going. If he even knew about the strange classes I was taking, he never said anything.
|
||||
|
||||
So now I was in a PhD program in computer science, yet planning to be an artist, yet also genuinely in love with Lisp hacking and working away at _On Lisp_. In other words, like many a grad student, I was working energetically on multiple projects that were not my thesis.
|
||||
|
||||
I didn't see a way out of this situation. I didn't want to drop out of grad school, but how else was I going to get out? I remember when my friend Robert Morris got kicked out of Cornell for writing the internet worm of 1988, I was envious that he'd found such a spectacular way to get out of grad school.
|
||||
|
||||
Then one day in April 1990 a crack appeared in the wall. I ran into professor Cheatham and he asked if I was far enough along to graduate that June. I didn't have a word of my dissertation written, but in what must have been the quickest bit of thinking in my life, I decided to take a shot at writing one in the 5 weeks or so that remained before the deadline, reusing parts of _On Lisp_ where I could, and I was able to respond, with no perceptible delay, "Yes, I think so. I'll give you something to read in a few days."
|
||||
|
||||
I picked applications of continuations as the topic. In retrospect I should have written about macros and embedded languages. There's a whole world there that's barely been explored. But all I wanted was to get out of grad school, and my rapidly written dissertation sufficed, just barely.
|
||||
|
||||
Meanwhile I was applying to art schools. I applied to two: RISD in the US, and the Accademia di Belle Arti in Florence, which, because it was the oldest art school, I imagined would be good. RISD accepted me, and I never heard back from the Accademia, so off to Providence I went.
|
||||
|
||||
I'd applied for the BFA program at RISD, which meant in effect that I had to go to college again. This was not as strange as it sounds, because I was only 25, and art schools are full of people of different ages. RISD counted me as a transfer sophomore and said I had to do the foundation that summer. The foundation means the classes that everyone has to take in fundamental subjects like drawing, color, and design.
|
||||
|
||||
Toward the end of the summer I got a big surprise: a letter from the Accademia, which had been delayed because they'd sent it to Cambridge England instead of Cambridge Massachusetts, inviting me to take the entrance exam in Florence that fall. This was now only weeks away. My nice landlady let me leave my stuff in her attic. I had some money saved from consulting work I'd done in grad school; there was probably enough to last a year if I lived cheaply. Now all I had to do was learn Italian.
|
||||
|
||||
Only _stranieri_ (foreigners) had to take this entrance exam. In retrospect it may well have been a way of excluding them, because there were so many _stranieri_ attracted by the idea of studying art in Florence that the Italian students would otherwise have been outnumbered. I was in decent shape at painting and drawing from the RISD foundation that summer, but I still don't know how I managed to pass the written exam. I remember that I answered the essay question by writing about Cezanne, and that I cranked up the intellectual level as high as I could to make the most of my limited vocabulary. [2]
|
||||
|
||||
I'm only up to age 25 and already there are such conspicuous patterns. Here I was, yet again about to attend some august institution in the hopes of learning about some prestigious subject, and yet again about to be disappointed. The students and faculty in the painting department at the Accademia were the nicest people you could imagine, but they had long since arrived at an arrangement whereby the students wouldn't require the faculty to teach anything, and in return the faculty wouldn't require the students to learn anything. And at the same time all involved would adhere outwardly to the conventions of a 19th century atelier. We actually had one of those little stoves, fed with kindling, that you see in 19th century studio paintings, and a nude model sitting as close to it as possible without getting burned. Except hardly anyone else painted her besides me. The rest of the students spent their time chatting or occasionally trying to imitate things they'd seen in American art magazines.
|
||||
|
||||
Our model turned out to live just down the street from me. She made a living from a combination of modelling and making fakes for a local antique dealer. She'd copy an obscure old painting out of a book, and then he'd take the copy and maltreat it to make it look old. [3]
|
||||
|
||||
While I was a student at the Accademia I started painting still lives in my bedroom at night. These paintings were tiny, because the room was, and because I painted them on leftover scraps of canvas, which was all I could afford at the time. Painting still lives is different from painting people, because the subject, as its name suggests, can't move. People can't sit for more than about 15 minutes at a time, and when they do they don't sit very still. So the traditional m.o. for painting people is to know how to paint a generic person, which you then modify to match the specific person you're painting. Whereas a still life you can, if you want, copy pixel by pixel from what you're seeing. You don't want to stop there, of course, or you get merely photographic accuracy, and what makes a still life interesting is that it's been through a head. You want to emphasize the visual cues that tell you, for example, that the reason the color changes suddenly at a certain point is that it's the edge of an object. By subtly emphasizing such things you can make paintings that are more realistic than photographs not just in some metaphorical sense, but in the strict information-theoretic sense. [4]
|
||||
|
||||
I liked painting still lives because I was curious about what I was seeing. In everyday life, we aren't consciously aware of much we're seeing. Most visual perception is handled by low-level processes that merely tell your brain "that's a water droplet" without telling you details like where the lightest and darkest points are, or "that's a bush" without telling you the shape and position of every leaf. This is a feature of brains, not a bug. In everyday life it would be distracting to notice every leaf on every bush. But when you have to paint something, you have to look more closely, and when you do there's a lot to see. You can still be noticing new things after days of trying to paint something people usually take for granted, just as you can after days of trying to write an essay about something people usually take for granted.
|
||||
|
||||
This is not the only way to paint. I'm not 100% sure it's even a good way to paint. But it seemed a good enough bet to be worth trying.
|
||||
|
||||
Our teacher, professor Ulivi, was a nice guy. He could see I worked hard, and gave me a good grade, which he wrote down in a sort of passport each student had. But the Accademia wasn't teaching me anything except Italian, and my money was running out, so at the end of the first year I went back to the US.
|
||||
|
||||
I wanted to go back to RISD, but I was now broke and RISD was very expensive, so I decided to get a job for a year and then return to RISD the next fall. I got one at a company called Interleaf, which made software for creating documents. You mean like Microsoft Word? Exactly. That was how I learned that low end software tends to eat high end software. But Interleaf still had a few years to live yet. [5]
|
||||
|
||||
Interleaf had done something pretty bold. Inspired by Emacs, they'd added a scripting language, and even made the scripting language a dialect of Lisp. Now they wanted a Lisp hacker to write things in it. This was the closest thing I've had to a normal job, and I hereby apologize to my boss and coworkers, because I was a bad employee. Their Lisp was the thinnest icing on a giant C cake, and since I didn't know C and didn't want to learn it, I never understood most of the software. Plus I was terribly irresponsible. This was back when a programming job meant showing up every day during certain working hours. That seemed unnatural to me, and on this point the rest of the world is coming around to my way of thinking, but at the time it caused a lot of friction. Toward the end of the year I spent much of my time surreptitiously working on _On Lisp_ , which I had by this time gotten a contract to publish.
|
||||
|
||||
The good part was that I got paid huge amounts of money, especially by art student standards. In Florence, after paying my part of the rent, my budget for everything else had been $7 a day. Now I was getting paid more than 4 times that every hour, even when I was just sitting in a meeting. By living cheaply I not only managed to save enough to go back to RISD, but also paid off my college loans.
|
||||
|
||||
I learned some useful things at Interleaf, though they were mostly about what not to do. I learned that it's better for technology companies to be run by product people than sales people (though sales is a real skill and people who are good at it are really good at it), that it leads to bugs when code is edited by too many people, that cheap office space is no bargain if it's depressing, that planned meetings are inferior to corridor conversations, that big, bureaucratic customers are a dangerous source of money, and that there's not much overlap between conventional office hours and the optimal time for hacking, or conventional offices and the optimal place for it.
|
||||
|
||||
But the most important thing I learned, and which I used in both Viaweb and Y Combinator, is that the low end eats the high end: that it's good to be the "entry level" option, even though that will be less prestigious, because if you're not, someone else will be, and will squash you against the ceiling. Which in turn means that prestige is a danger sign.
|
||||
|
||||
When I left to go back to RISD the next fall, I arranged to do freelance work for the group that did projects for customers, and this was how I survived for the next several years. When I came back to visit for a project later on, someone told me about a new thing called HTML, which was, as he described it, a derivative of SGML. Markup language enthusiasts were an occupational hazard at Interleaf and I ignored him, but this HTML thing later became a big part of my life.
|
||||
|
||||
In the fall of 1992 I moved back to Providence to continue at RISD. The foundation had merely been intro stuff, and the Accademia had been a (very civilized) joke. Now I was going to see what real art school was like. But alas it was more like the Accademia than not. Better organized, certainly, and a lot more expensive, but it was now becoming clear that art school did not bear the same relationship to art that medical school bore to medicine. At least not the painting department. The textile department, which my next door neighbor belonged to, seemed to be pretty rigorous. No doubt illustration and architecture were too. But painting was post-rigorous. Painting students were supposed to express themselves, which to the more worldly ones meant to try to cook up some sort of distinctive signature style.
|
||||
|
||||
A signature style is the visual equivalent of what in show business is known as a "schtick": something that immediately identifies the work as yours and no one else's. For example, when you see a painting that looks like a certain kind of cartoon, you know it's by Roy Lichtenstein. So if you see a big painting of this type hanging in the apartment of a hedge fund manager, you know he paid millions of dollars for it. That's not always why artists have a signature style, but it's usually why buyers pay a lot for such work. [6]
|
||||
|
||||
There were plenty of earnest students too: kids who "could draw" in high school, and now had come to what was supposed to be the best art school in the country, to learn to draw even better. They tended to be confused and demoralized by what they found at RISD, but they kept going, because painting was what they did. I was not one of the kids who could draw in high school, but at RISD I was definitely closer to their tribe than the tribe of signature style seekers.
|
||||
|
||||
I learned a lot in the color class I took at RISD, but otherwise I was basically teaching myself to paint, and I could do that for free. So in 1993 I dropped out. I hung around Providence for a bit, and then my college friend Nancy Parmet did me a big favor. A rent-controlled apartment in a building her mother owned in New York was becoming vacant. Did I want it? It wasn't much more than my current place, and New York was supposed to be where the artists were. So yes, I wanted it! [7]
|
||||
|
||||
Asterix comics begin by zooming in on a tiny corner of Roman Gaul that turns out not to be controlled by the Romans. You can do something similar on a map of New York City: if you zoom in on the Upper East Side, there's a tiny corner that's not rich, or at least wasn't in 1993. It's called Yorkville, and that was my new home. Now I was a New York artist in the strictly technical sense of making paintings and living in New York.
|
||||
|
||||
I was nervous about money, because I could sense that Interleaf was on the way down. Freelance Lisp hacking work was very rare, and I didn't want to have to program in another language, which in those days would have meant C++ if I was lucky. So with my unerring nose for financial opportunity, I decided to write another book on Lisp. This would be a popular book, the sort of book that could be used as a textbook. I imagined myself living frugally off the royalties and spending all my time painting. (The painting on the cover of this book, _ANSI Common Lisp_ , is one that I painted around this time.)
|
||||
|
||||
The best thing about New York for me was the presence of Idelle and Julian Weber. Idelle Weber was a painter, one of the early photorealists, and I'd taken her painting class at Harvard. I've never known a teacher more beloved by her students. Large numbers of former students kept in touch with her, including me. After I moved to New York I became her de facto studio assistant.
|
||||
|
||||
She liked to paint on big, square canvases, 4 to 5 feet on a side. One day in late 1994 as I was stretching one of these monsters there was something on the radio about a famous fund manager. He wasn't that much older than me, and was super rich. The thought suddenly occurred to me: why don't I become rich? Then I'll be able to work on whatever I want.
|
||||
|
||||
Meanwhile I'd been hearing more and more about this new thing called the World Wide Web. Robert Morris showed it to me when I visited him in Cambridge, where he was now in grad school at Harvard. It seemed to me that the web would be a big deal. I'd seen what graphical user interfaces had done for the popularity of microcomputers. It seemed like the web would do the same for the internet.
|
||||
|
||||
If I wanted to get rich, here was the next train leaving the station. I was right about that part. What I got wrong was the idea. I decided we should start a company to put art galleries online. I can't honestly say, after reading so many Y Combinator applications, that this was the worst startup idea ever, but it was up there. Art galleries didn't want to be online, and still don't, not the fancy ones. That's not how they sell. I wrote some software to generate web sites for galleries, and Robert wrote some to resize images and set up an http server to serve the pages. Then we tried to sign up galleries. To call this a difficult sale would be an understatement. It was difficult to give away. A few galleries let us make sites for them for free, but none paid us.
|
||||
|
||||
Then some online stores started to appear, and I realized that except for the order buttons they were identical to the sites we'd been generating for galleries. This impressive-sounding thing called an "internet storefront" was something we already knew how to build.
|
||||
|
||||
So in the summer of 1995, after I submitted the camera-ready copy of _ANSI Common Lisp_ to the publishers, we started trying to write software to build online stores. At first this was going to be normal desktop software, which in those days meant Windows software. That was an alarming prospect, because neither of us knew how to write Windows software or wanted to learn. We lived in the Unix world. But we decided we'd at least try writing a prototype store builder on Unix. Robert wrote a shopping cart, and I wrote a new site generator for stores — in Lisp, of course.
|
||||
|
||||
We were working out of Robert's apartment in Cambridge. His roommate was away for big chunks of time, during which I got to sleep in his room. For some reason there was no bed frame or sheets, just a mattress on the floor. One morning as I was lying on this mattress I had an idea that made me sit up like a capital L. What if we ran the software on the server, and let users control it by clicking on links? Then we'd never have to write anything to run on users' computers. We could generate the sites on the same server we'd serve them from. Users wouldn't need anything more than a browser.
|
||||
|
||||
This kind of software, known as a web app, is common now, but at the time it wasn't clear that it was even possible. To find out, we decided to try making a version of our store builder that you could control through the browser. A couple days later, on August 12, we had one that worked. The UI was horrible, but it proved you could build a whole store through the browser, without any client software or typing anything into the command line on the server.
|
||||
|
||||
Now we felt like we were really onto something. I had visions of a whole new generation of software working this way. You wouldn't need versions, or ports, or any of that crap. At Interleaf there had been a whole group called Release Engineering that seemed to be at least as big as the group that actually wrote the software. Now you could just update the software right on the server.
|
||||
|
||||
We started a new company we called Viaweb, after the fact that our software worked via the web, and we got $10,000 in seed funding from Idelle's husband Julian. In return for that and doing the initial legal work and giving us business advice, we gave him 10% of the company. Ten years later this deal became the model for Y Combinator's. We knew founders needed something like this, because we'd needed it ourselves.
|
||||
|
||||
At this stage I had a negative net worth, because the thousand dollars or so I had in the bank was more than counterbalanced by what I owed the government in taxes. (Had I diligently set aside the proper proportion of the money I'd made consulting for Interleaf? No, I had not.) So although Robert had his graduate student stipend, I needed that seed funding to live on.
|
||||
|
||||
We originally hoped to launch in September, but we got more ambitious about the software as we worked on it. Eventually we managed to build a WYSIWYG site builder, in the sense that as you were creating pages, they looked exactly like the static ones that would be generated later, except that instead of leading to static pages, the links all referred to closures stored in a hash table on the server.
|
||||
|
||||
It helped to have studied art, because the main goal of an online store builder is to make users look legit, and the key to looking legit is high production values. If you get page layouts and fonts and colors right, you can make a guy running a store out of his bedroom look more legit than a big company.
|
||||
|
||||
(If you're curious why my site looks so old-fashioned, it's because it's still made with this software. It may look clunky today, but in 1996 it was the last word in slick.)
|
||||
|
||||
In September, Robert rebelled. "We've been working on this for a month," he said, "and it's still not done." This is funny in retrospect, because he would still be working on it almost 3 years later. But I decided it might be prudent to recruit more programmers, and I asked Robert who else in grad school with him was really good. He recommended Trevor Blackwell, which surprised me at first, because at that point I knew Trevor mainly for his plan to reduce everything in his life to a stack of notecards, which he carried around with him. But Rtm was right, as usual. Trevor turned out to be a frighteningly effective hacker.
|
||||
|
||||
It was a lot of fun working with Robert and Trevor. They're the two most [_independent-minded_](think.html) people I know, and in completely different ways. If you could see inside Rtm's brain it would look like a colonial New England church, and if you could see inside Trevor's it would look like the worst excesses of Austrian Rococo.
|
||||
|
||||
We opened for business, with 6 stores, in January 1996. It was just as well we waited a few months, because although we worried we were late, we were actually almost fatally early. There was a lot of talk in the press then about ecommerce, but not many people actually wanted online stores. [8]
|
||||
|
||||
There were three main parts to the software: the editor, which people used to build sites and which I wrote, the shopping cart, which Robert wrote, and the manager, which kept track of orders and statistics, and which Trevor wrote. In its time, the editor was one of the best general-purpose site builders. I kept the code tight and didn't have to integrate with any other software except Robert's and Trevor's, so it was quite fun to work on. If all I'd had to do was work on this software, the next 3 years would have been the easiest of my life. Unfortunately I had to do a lot more, all of it stuff I was worse at than programming, and the next 3 years were instead the most stressful.
|
||||
|
||||
There were a lot of startups making ecommerce software in the second half of the 90s. We were determined to be the Microsoft Word, not the Interleaf. Which meant being easy to use and inexpensive. It was lucky for us that we were poor, because that caused us to make Viaweb even more inexpensive than we realized. We charged $100 a month for a small store and $300 a month for a big one. This low price was a big attraction, and a constant thorn in the sides of competitors, but it wasn't because of some clever insight that we set the price low. We had no idea what businesses paid for things. $300 a month seemed like a lot of money to us.
|
||||
|
||||
We did a lot of things right by accident like that. For example, we did what's now called "doing things that [_don't scale_](ds.html)," although at the time we would have described it as "being so lame that we're driven to the most desperate measures to get users." The most common of which was building stores for them. This seemed particularly humiliating, since the whole raison d'etre of our software was that people could use it to make their own stores. But anything to get users.
|
||||
|
||||
We learned a lot more about retail than we wanted to know. For example, that if you could only have a small image of a man's shirt (and all images were small then by present standards), it was better to have a closeup of the collar than a picture of the whole shirt. The reason I remember learning this was that it meant I had to rescan about 30 images of men's shirts. My first set of scans were so beautiful too.
|
||||
|
||||
Though this felt wrong, it was exactly the right thing to be doing. Building stores for users taught us about retail, and about how it felt to use our software. I was initially both mystified and repelled by "business" and thought we needed a "business person" to be in charge of it, but once we started to get users, I was converted, in much the same way I was converted to [_fatherhood_](kids.html) once I had kids. Whatever users wanted, I was all theirs. Maybe one day we'd have so many users that I couldn't scan their images for them, but in the meantime there was nothing more important to do.
|
||||
|
||||
Another thing I didn't get at the time is that [_growth rate_](growth.html) is the ultimate test of a startup. Our growth rate was fine. We had about 70 stores at the end of 1996 and about 500 at the end of 1997. I mistakenly thought the thing that mattered was the absolute number of users. And that is the thing that matters in the sense that that's how much money you're making, and if you're not making enough, you might go out of business. But in the long term the growth rate takes care of the absolute number. If we'd been a startup I was advising at Y Combinator, I would have said: Stop being so stressed out, because you're doing fine. You're growing 7x a year. Just don't hire too many more people and you'll soon be profitable, and then you'll control your own destiny.
|
||||
|
||||
Alas I hired lots more people, partly because our investors wanted me to, and partly because that's what startups did during the Internet Bubble. A company with just a handful of employees would have seemed amateurish. So we didn't reach breakeven until about when Yahoo bought us in the summer of 1998. Which in turn meant we were at the mercy of investors for the entire life of the company. And since both we and our investors were noobs at startups, the result was a mess even by startup standards.
|
||||
|
||||
It was a huge relief when Yahoo bought us. In principle our Viaweb stock was valuable. It was a share in a business that was profitable and growing rapidly. But it didn't feel very valuable to me; I had no idea how to value a business, but I was all too keenly aware of the near-death experiences we seemed to have every few months. Nor had I changed my grad student lifestyle significantly since we started. So when Yahoo bought us it felt like going from rags to riches. Since we were going to California, I bought a car, a yellow 1998 VW GTI. I remember thinking that its leather seats alone were by far the most luxurious thing I owned.
|
||||
|
||||
The next year, from the summer of 1998 to the summer of 1999, must have been the least productive of my life. I didn't realize it at the time, but I was worn out from the effort and stress of running Viaweb. For a while after I got to California I tried to continue my usual m.o. of programming till 3 in the morning, but fatigue combined with Yahoo's prematurely aged [_culture_](yahoo.html) and grim cube farm in Santa Clara gradually dragged me down. After a few months it felt disconcertingly like working at Interleaf.
|
||||
|
||||
Yahoo had given us a lot of options when they bought us. At the time I thought Yahoo was so overvalued that they'd never be worth anything, but to my astonishment the stock went up 5x in the next year. I hung on till the first chunk of options vested, then in the summer of 1999 I left. It had been so long since I'd painted anything that I'd half forgotten why I was doing this. My brain had been entirely full of software and men's shirts for 4 years. But I had done this to get rich so I could paint, I reminded myself, and now I was rich, so I should go paint.
|
||||
|
||||
When I said I was leaving, my boss at Yahoo had a long conversation with me about my plans. I told him all about the kinds of pictures I wanted to paint. At the time I was touched that he took such an interest in me. Now I realize it was because he thought I was lying. My options at that point were worth about $2 million a month. If I was leaving that kind of money on the table, it could only be to go and start some new startup, and if I did, I might take people with me. This was the height of the Internet Bubble, and Yahoo was ground zero of it. My boss was at that moment a billionaire. Leaving then to start a new startup must have seemed to him an insanely, and yet also plausibly, ambitious plan.
|
||||
|
||||
But I really was quitting to paint, and I started immediately. There was no time to lose. I'd already burned 4 years getting rich. Now when I talk to founders who are leaving after selling their companies, my advice is always the same: take a vacation. That's what I should have done, just gone off somewhere and done nothing for a month or two, but the idea never occurred to me.
|
||||
|
||||
|
||||
So I tried to paint, but I just didn't seem to have any energy or ambition. Part of the problem was that I didn't know many people in California. I'd compounded this problem by buying a house up in the Santa Cruz Mountains, with a beautiful view but miles from anywhere. I stuck it out for a few more months, then in desperation I went back to New York, where unless you understand about rent control you'll be surprised to hear I still had my apartment, sealed up like a tomb of my old life. Idelle was in New York at least, and there were other people trying to paint there, even though I didn't know any of them.
|
||||
|
||||
When I got back to New York I resumed my old life, except now I was rich. It was as weird as it sounds. I resumed all my old patterns, except now there were doors where there hadn't been. Now when I was tired of walking, all I had to do was raise my hand, and (unless it was raining) a taxi would stop to pick me up. Now when I walked past charming little restaurants I could go in and order lunch. It was exciting for a while. Painting started to go better. I experimented with a new kind of still life where I'd paint one painting in the old way, then photograph it and print it, blown up, on canvas, and then use that as the underpainting for a second still life, painted from the same objects (which hopefully hadn't rotted yet).
|
||||
|
||||
Meanwhile I looked for an apartment to buy. Now I could actually choose what neighborhood to live in. Where, I asked myself and various real estate agents, is the Cambridge of New York? Aided by occasional visits to actual Cambridge, I gradually realized there wasn't one. Huh.
|
||||
|
||||
Around this time, in the spring of 2000, I had an idea. It was clear from our experience with Viaweb that web apps were the future. Why not build a web app for making web apps? Why not let people edit code on our server through the browser, and then host the resulting applications for them? [9] You could run all sorts of services on the servers that these applications could use just by making an API call: making and receiving phone calls, manipulating images, taking credit card payments, etc.
|
||||
|
||||
I got so excited about this idea that I couldn't think about anything else. It seemed obvious that this was the future. I didn't particularly want to start another company, but it was clear that this idea would have to be embodied as one, so I decided to move to Cambridge and start it. I hoped to lure Robert into working on it with me, but there I ran into a hitch. Robert was now a postdoc at MIT, and though he'd made a lot of money the last time I'd lured him into working on one of my schemes, it had also been a huge time sink. So while he agreed that it sounded like a plausible idea, he firmly refused to work on it.
|
||||
|
||||
Hmph. Well, I'd do it myself then. I recruited Dan Giffin, who had worked for Viaweb, and two undergrads who wanted summer jobs, and we got to work trying to build what it's now clear is about twenty companies and several open source projects worth of software. The language for defining applications would of course be a dialect of Lisp. But I wasn't so naive as to assume I could spring an overt Lisp on a general audience; we'd hide the parentheses, like Dylan did.
|
||||
|
||||
|
||||
By then there was a name for the kind of company Viaweb was, an "application service provider," or ASP. This name didn't last long before it was replaced by "software as a service," but it was current for long enough that I named this new company after it: it was going to be called Aspra.
|
||||
|
||||
I started working on the application builder, Dan worked on network infrastructure, and the two undergrads worked on the first two services (images and phone calls). But about halfway through the summer I realized I really didn't want to run a company — especially not a big one, which it was looking like this would have to be. I'd only started Viaweb because I needed the money. Now that I didn't need money anymore, why was I doing this? If this vision had to be realized as a company, then screw the vision. I'd build a subset that could be done as an open source project.
|
||||
|
||||
Much to my surprise, the time I spent working on this stuff was not wasted after all. After we started Y Combinator, I would often encounter startups working on parts of this new architecture, and it was very useful to have spent so much time thinking about it and even trying to write some of it.
|
||||
|
||||
The subset I would build as an open source project was the new Lisp, whose parentheses I now wouldn't even have to hide. A lot of Lisp hackers dream of building a new Lisp, partly because one of the distinctive features of the language is that it has dialects, and partly, I think, because we have in our minds a Platonic form of Lisp that all existing dialects fall short of. I certainly did. So at the end of the summer Dan and I switched to working on this new dialect of Lisp, which I called Arc, in a house I bought in Cambridge.
|
||||
|
||||
The following spring, lightning struck. I was invited to give a talk at a Lisp conference, so I gave one about how we'd used Lisp at Viaweb. Afterward I put a postscript file of this talk online, on paulgraham.com, which I'd created years before using Viaweb but had never used for anything. In one day it got 30,000 page views. What on earth had happened? The referring urls showed that someone had posted it on Slashdot. [10]
|
||||
|
||||
Wow, I thought, there's an audience. If I write something and put it on the web, anyone can read it. That may seem obvious now, but it was surprising then. In the print era there was a narrow channel to readers, guarded by fierce monsters known as editors. The only way to get an audience for anything you wrote was to get it published as a book, or in a newspaper or magazine. Now anyone could publish anything.
|
||||
|
||||
This had been possible in principle since 1993, but not many people had realized it yet. I had been intimately involved with building the infrastructure of the web for most of that time, and a writer as well, and it had taken me 8 years to realize it. Even then it took me several years to understand the implications. It meant there would be a whole new generation of [_essays_](essay.html). [11]
|
||||
|
||||
In the print era, the channel for publishing essays had been vanishingly small. Except for a few officially anointed thinkers who went to the right parties in New York, the only people allowed to publish essays were specialists writing about their specialties. There were so many essays that had never been written, because there had been no way to publish them. Now they could be, and I was going to write them. [12]
|
||||
|
||||
I've worked on several different things, but to the extent there was a turning point where I figured out what to work on, it was when I started publishing essays online. From then on I knew that whatever else I did, I'd always write essays too.
|
||||
|
||||
I knew that online essays would be a [_marginal_](marginal.html) medium at first. Socially they'd seem more like rants posted by nutjobs on their GeoCities sites than the genteel and beautifully typeset compositions published in _The New Yorker_. But by this point I knew enough to find that encouraging instead of discouraging.
|
||||
|
||||
One of the most conspicuous patterns I've noticed in my life is how well it has worked, for me at least, to work on things that weren't prestigious. Still life has always been the least prestigious form of painting. Viaweb and Y Combinator both seemed lame when we started them. I still get the glassy eye from strangers when they ask what I'm writing, and I explain that it's an essay I'm going to publish on my web site. Even Lisp, though prestigious intellectually in something like the way Latin is, also seems about as hip.
|
||||
|
||||
It's not that unprestigious types of work are good per se. But when you find yourself drawn to some kind of work despite its current lack of prestige, it's a sign both that there's something real to be discovered there, and that you have the right kind of motives. Impure motives are a big danger for the ambitious. If anything is going to lead you astray, it will be the desire to impress people. So while working on things that aren't prestigious doesn't guarantee you're on the right track, it at least guarantees you're not on the most common type of wrong one.
|
||||
|
||||
Over the next several years I wrote lots of essays about all kinds of different topics. O'Reilly reprinted a collection of them as a book, called _Hackers & Painters_ after one of the essays in it. I also worked on spam filters, and did some more painting. I used to have dinners for a group of friends every thursday night, which taught me how to cook for groups. And I bought another building in Cambridge, a former candy factory (and later, twas said, porn studio), to use as an office.
|
||||
|
||||
One night in October 2003 there was a big party at my house. It was a clever idea of my friend Maria Daniels, who was one of the thursday diners. Three separate hosts would all invite their friends to one party. So for every guest, two thirds of the other guests would be people they didn't know but would probably like. One of the guests was someone I didn't know but would turn out to like a lot: a woman called Jessica Livingston. A couple days later I asked her out.
|
||||
|
||||
Jessica was in charge of marketing at a Boston investment bank. This bank thought it understood startups, but over the next year, as she met friends of mine from the startup world, she was surprised how different reality was. And how colorful their stories were. So she decided to compile a book of [_interviews_](https://www.amazon.com/Founders-Work-Stories-Startups-Early/dp/1430210788) with startup founders.
|
||||
|
||||
When the bank had financial problems and she had to fire half her staff, she started looking for a new job. In early 2005 she interviewed for a marketing job at a Boston VC firm. It took them weeks to make up their minds, and during this time I started telling her about all the things that needed to be fixed about venture capital. They should make a larger number of smaller investments instead of a handful of giant ones, they should be funding younger, more technical founders instead of MBAs, they should let the founders remain as CEO, and so on.
|
||||
|
||||
One of my tricks for writing essays had always been to give talks. The prospect of having to stand up in front of a group of people and tell them something that won't waste their time is a great spur to the imagination. When the Harvard Computer Society, the undergrad computer club, asked me to give a talk, I decided I would tell them how to start a startup. Maybe they'd be able to avoid the worst of the mistakes we'd made.
|
||||
|
||||
So I gave this talk, in the course of which I told them that the best sources of seed funding were successful startup founders, because then they'd be sources of advice too. Whereupon it seemed they were all looking expectantly at me. Horrified at the prospect of having my inbox flooded by business plans (if I'd only known), I blurted out "But not me!" and went on with the talk. But afterward it occurred to me that I should really stop procrastinating about angel investing. I'd been meaning to since Yahoo bought us, and now it was 7 years later and I still hadn't done one angel investment.
|
||||
|
||||
Meanwhile I had been scheming with Robert and Trevor about projects we could work on together. I missed working with them, and it seemed like there had to be something we could collaborate on.
|
||||
|
||||
As Jessica and I were walking home from dinner on March 11, at the corner of Garden and Walker streets, these three threads converged. Screw the VCs who were taking so long to make up their minds. We'd start our own investment firm and actually implement the ideas we'd been talking about. I'd fund it, and Jessica could quit her job and work for it, and we'd get Robert and Trevor as partners too. [13]
|
||||
|
||||
Once again, ignorance worked in our favor. We had no idea how to be angel investors, and in Boston in 2005 there were no Ron Conways to learn from. So we just made what seemed like the obvious choices, and some of the things we did turned out to be novel.
|
||||
|
||||
There are multiple components to Y Combinator, and we didn't figure them all out at once. The part we got first was to be an angel firm. In those days, those two words didn't go together. There were VC firms, which were organized companies with people whose job it was to make investments, but they only did big, million dollar investments. And there were angels, who did smaller investments, but these were individuals who were usually focused on other things and made investments on the side. And neither of them helped founders enough in the beginning. We knew how helpless founders were in some respects, because we remembered how helpless we'd been. For example, one thing Julian had done for us that seemed to us like magic was to get us set up as a company. We were fine writing fairly difficult software, but actually getting incorporated, with bylaws and stock and all that stuff, how on earth did you do that? Our plan was not only to make seed investments, but to do for startups everything Julian had done for us.
|
||||
|
||||
YC was not organized as a fund. It was cheap enough to run that we funded it with our own money. That went right by 99% of readers, but professional investors are thinking "Wow, that means they got all the returns." But once again, this was not due to any particular insight on our part. We didn't know how VC firms were organized. It never occurred to us to try to raise a fund, and if it had, we wouldn't have known where to start. [14]
|
||||
|
||||
The most distinctive thing about YC is the batch model: to fund a bunch of startups all at once, twice a year, and then to spend three months focusing intensively on trying to help them. That part we discovered by accident, not merely implicitly but explicitly due to our ignorance about investing. We needed to get experience as investors. What better way, we thought, than to fund a whole bunch of startups at once? We knew undergrads got temporary jobs at tech companies during the summer. Why not organize a summer program where they'd start startups instead? We wouldn't feel guilty for being in a sense fake investors, because they would in a similar sense be fake founders. So while we probably wouldn't make much money out of it, we'd at least get to practice being investors on them, and they for their part would probably have a more interesting summer than they would working at Microsoft.
|
||||
|
||||
We'd use the building I owned in Cambridge as our headquarters. We'd all have dinner there once a week — on tuesdays, since I was already cooking for the thursday diners on thursdays — and after dinner we'd bring in experts on startups to give talks.
|
||||
|
||||
We knew undergrads were deciding then about summer jobs, so in a matter of days we cooked up something we called the Summer Founders Program, and I posted an [_announcement_](summerfounder.html) on my site, inviting undergrads to apply. I had never imagined that writing essays would be a way to get "deal flow," as investors call it, but it turned out to be the perfect source. [15] We got 225 applications for the Summer Founders Program, and we were surprised to find that a lot of them were from people who'd already graduated, or were about to that spring. Already this SFP thing was starting to feel more serious than we'd intended.
|
||||
|
||||
We invited about 20 of the 225 groups to interview in person, and from those we picked 8 to fund. They were an impressive group. That first batch included reddit, Justin Kan and Emmett Shear, who went on to found Twitch, Aaron Swartz, who had already helped write the RSS spec and would a few years later become a martyr for open access, and Sam Altman, who would later become the second president of YC. I don't think it was entirely luck that the first batch was so good. You had to be pretty bold to sign up for a weird thing like the Summer Founders Program instead of a summer job at a legit place like Microsoft or Goldman Sachs.
|
||||
|
||||
The deal for startups was based on a combination of the deal we did with Julian ($10k for 10%) and what Robert said MIT grad students got for the summer ($6k). We invested $6k per founder, which in the typical two-founder case was $12k, in return for 6%. That had to be fair, because it was twice as good as the deal we ourselves had taken. Plus that first summer, which was really hot, Jessica brought the founders free air conditioners. [16]
|
||||
|
||||
Fairly quickly I realized that we had stumbled upon the way to scale startup funding. Funding startups in batches was more convenient for us, because it meant we could do things for a lot of startups at once, but being part of a batch was better for the startups too. It solved one of the biggest problems faced by founders: the isolation. Now you not only had colleagues, but colleagues who understood the problems you were facing and could tell you how they were solving them.
|
||||
|
||||
As YC grew, we started to notice other advantages of scale. The alumni became a tight community, dedicated to helping one another, and especially the current batch, whose shoes they remembered being in. We also noticed that the startups were becoming one another's customers. We used to refer jokingly to the "YC GDP," but as YC grows this becomes less and less of a joke. Now lots of startups get their initial set of customers almost entirely from among their batchmates.
|
||||
|
||||
I had not originally intended YC to be a full-time job. I was going to do three things: hack, write essays, and work on YC. As YC grew, and I grew more excited about it, it started to take up a lot more than a third of my attention. But for the first few years I was still able to work on other things.
|
||||
|
||||
In the summer of 2006, Robert and I started working on a new version of Arc. This one was reasonably fast, because it was compiled into Scheme. To test this new Arc, I wrote Hacker News in it. It was originally meant to be a news aggregator for startup founders and was called Startup News, but after a few months I got tired of reading about nothing but startups. Plus it wasn't startup founders we wanted to reach. It was future startup founders. So I changed the name to Hacker News and the topic to whatever engaged one's intellectual curiosity.
|
||||
|
||||
HN was no doubt good for YC, but it was also by far the biggest source of stress for me. If all I'd had to do was select and help founders, life would have been so easy. And that implies that HN was a mistake. Surely the biggest source of stress in one's work should at least be something close to the core of the work. Whereas I was like someone who was in pain while running a marathon not from the exertion of running, but because I had a blister from an ill-fitting shoe. When I was dealing with some urgent problem during YC, there was about a 60% chance it had to do with HN, and a 40% chance it had to do with everything else combined. [17]
|
||||
|
||||
As well as HN, I wrote all of YC's internal software in Arc. But while I continued to work a good deal _in_ Arc, I gradually stopped working _on_ Arc, partly because I didn't have time to, and partly because it was a lot less attractive to mess around with the language now that we had all this infrastructure depending on it. So now my three projects were reduced to two: writing essays and working on YC.
|
||||
|
||||
YC was different from other kinds of work I've done. Instead of deciding for myself what to work on, the problems came to me. Every 6 months there was a new batch of startups, and their problems, whatever they were, became our problems. It was very engaging work, because their problems were quite varied, and the good founders were very effective. If you were trying to learn the most you could about startups in the shortest possible time, you couldn't have picked a better way to do it.
|
||||
|
||||
There were parts of the job I didn't like. Disputes between cofounders, figuring out when people were lying to us, fighting with people who maltreated the startups, and so on. But I worked hard even at the parts I didn't like. I was haunted by something Kevin Hale once said about companies: "No one works harder than the boss." He meant it both descriptively and prescriptively, and it was the second part that scared me. I wanted YC to be good, so if how hard I worked set the upper bound on how hard everyone else worked, I'd better work very hard.
|
||||
|
||||
One day in 2010, when he was visiting California for interviews, Robert Morris did something astonishing: he offered me unsolicited advice. I can only remember him doing that once before. One day at Viaweb, when I was bent over double from a kidney stone, he suggested that it would be a good idea for him to take me to the hospital. That was what it took for Rtm to offer unsolicited advice. So I remember his exact words very clearly. "You know," he said, "you should make sure Y Combinator isn't the last cool thing you do."
|
||||
|
||||
At the time I didn't understand what he meant, but gradually it dawned on me that he was saying I should quit. This seemed strange advice, because YC was doing great. But if there was one thing rarer than Rtm offering advice, it was Rtm being wrong. So this set me thinking. It was true that on my current trajectory, YC would be the last thing I did, because it was only taking up more of my attention. It had already eaten Arc, and was in the process of eating essays too. Either YC was my life's work or I'd have to leave eventually. And it wasn't, so I would.
|
||||
|
||||
In the summer of 2012 my mother had a stroke, and the cause turned out to be a blood clot caused by colon cancer. The stroke destroyed her balance, and she was put in a nursing home, but she really wanted to get out of it and back to her house, and my sister and I were determined to help her do it. I used to fly up to Oregon to visit her regularly, and I had a lot of time to think on those flights. On one of them I realized I was ready to hand YC over to someone else.
|
||||
|
||||
I asked Jessica if she wanted to be president, but she didn't, so we decided we'd try to recruit Sam Altman. We talked to Robert and Trevor and we agreed to make it a complete changing of the guard. Up till that point YC had been controlled by the original LLC we four had started. But we wanted YC to last for a long time, and to do that it couldn't be controlled by the founders. So if Sam said yes, we'd let him reorganize YC. Robert and I would retire, and Jessica and Trevor would become ordinary partners.
|
||||
|
||||
When we asked Sam if he wanted to be president of YC, initially he said no. He wanted to start a startup to make nuclear reactors. But I kept at it, and in October 2013 he finally agreed. We decided he'd take over starting with the winter 2014 batch. For the rest of 2013 I left running YC more and more to Sam, partly so he could learn the job, and partly because I was focused on my mother, whose cancer had returned.
|
||||
|
||||
She died on January 15, 2014. We knew this was coming, but it was still hard when it did.
|
||||
|
||||
I kept working on YC till March, to help get that batch of startups through Demo Day, then I checked out pretty completely. (I still talk to alumni and to new startups working on things I'm interested in, but that only takes a few hours a week.)
|
||||
|
||||
What should I do next? Rtm's advice hadn't included anything about that. I wanted to do something completely different, so I decided I'd paint. I wanted to see how good I could get if I really focused on it. So the day after I stopped working on YC, I started painting. I was rusty and it took a while to get back into shape, but it was at least completely engaging. [18]
|
||||
|
||||
I spent most of the rest of 2014 painting. I'd never been able to work so uninterruptedly before, and I got to be better than I had been. Not good enough, but better. Then in November, right in the middle of a painting, I ran out of steam. Up till that point I'd always been curious to see how the painting I was working on would turn out, but suddenly finishing this one seemed like a chore. So I stopped working on it and cleaned my brushes and haven't painted since. So far anyway.
|
||||
|
||||
I realize that sounds rather wimpy. But attention is a zero sum game. If you can choose what to work on, and you choose a project that's not the best one (or at least a good one) for you, then it's getting in the way of another project that is. And at 50 there was some opportunity cost to screwing around.
|
||||
|
||||
I started writing essays again, and wrote a bunch of new ones over the next few months. I even wrote a couple that [_weren't_](know.html) about startups. Then in March 2015 I started working on Lisp again.
|
||||
|
||||
The distinctive thing about Lisp is that its core is a language defined by writing an interpreter in itself. It wasn't originally intended as a programming language in the ordinary sense. It was meant to be a formal model of computation, an alternative to the Turing machine. If you want to write an interpreter for a language in itself, what's the minimum set of predefined operators you need? The Lisp that John McCarthy invented, or more accurately discovered, is an answer to that question. [19]
|
||||
|
||||
McCarthy didn't realize this Lisp could even be used to program computers till his grad student Steve Russell suggested it. Russell translated McCarthy's interpreter into IBM 704 machine language, and from that point Lisp started also to be a programming language in the ordinary sense. But its origins as a model of computation gave it a power and elegance that other languages couldn't match. It was this that attracted me in college, though I didn't understand why at the time.
|
||||
|
||||
McCarthy's 1960 Lisp did nothing more than interpret Lisp expressions. It was missing a lot of things you'd want in a programming language. So these had to be added, and when they were, they weren't defined using McCarthy's original axiomatic approach. That wouldn't have been feasible at the time. McCarthy tested his interpreter by hand-simulating the execution of programs. But it was already getting close to the limit of interpreters you could test that way — indeed, there was a bug in it that McCarthy had overlooked. To test a more complicated interpreter, you'd have had to run it, and computers then weren't powerful enough.
|
||||
|
||||
Now they are, though. Now you could continue using McCarthy's axiomatic approach till you'd defined a complete programming language. And as long as every change you made to McCarthy's Lisp was a discoveredness-preserving transformation, you could, in principle, end up with a complete language that had this quality. Harder to do than to talk about, of course, but if it was possible in principle, why not try? So I decided to take a shot at it. It took 4 years, from March 26, 2015 to October 12, 2019. It was fortunate that I had a precisely defined goal, or it would have been hard to keep at it for so long.
|
||||
|
||||
I wrote this new Lisp, called [_Bel_](bel.html), in itself in Arc. That may sound like a contradiction, but it's an indication of the sort of trickery I had to engage in to make this work. By means of an egregious collection of hacks I managed to make something close enough to an interpreter written in itself that could actually run. Not fast, but fast enough to test.
|
||||
|
||||
I had to ban myself from writing essays during most of this time, or I'd never have finished. In late 2015 I spent 3 months writing essays, and when I went back to working on Bel I could barely understand the code. Not so much because it was badly written as because the problem is so convoluted. When you're working on an interpreter written in itself, it's hard to keep track of what's happening at what level, and errors can be practically encrypted by the time you get them.
|
||||
|
||||
So I said no more essays till Bel was done. But I told few people about Bel while I was working on it. So for years it must have seemed that I was doing nothing, when in fact I was working harder than I'd ever worked on anything. Occasionally after wrestling for hours with some gruesome bug I'd check Twitter or HN and see someone asking "Does Paul Graham still code?"
|
||||
|
||||
Working on Bel was hard but satisfying. I worked on it so intensively that at any given time I had a decent chunk of the code in my head and could write more there. I remember taking the boys to the coast on a sunny day in 2015 and figuring out how to deal with some problem involving continuations while I watched them play in the tide pools. It felt like I was doing life right. I remember that because I was slightly dismayed at how novel it felt. The good news is that I had more moments like this over the next few years.
|
||||
|
||||
In the summer of 2016 we moved to England. We wanted our kids to see what it was like living in another country, and since I was a British citizen by birth, that seemed the obvious choice. We only meant to stay for a year, but we liked it so much that we still live there. So most of Bel was written in England.
|
||||
|
||||
In the fall of 2019, Bel was finally finished. Like McCarthy's original Lisp, it's a spec rather than an implementation, although like McCarthy's Lisp it's a spec expressed as code.
|
||||
|
||||
Now that I could write essays again, I wrote a bunch about topics I'd had stacked up. I kept writing essays through 2020, but I also started to think about other things I could work on. How should I choose what to do? Well, how had I chosen what to work on in the past? I wrote an essay for myself to answer that question, and I was surprised how long and messy the answer turned out to be. If this surprised me, who'd lived it, then I thought perhaps it would be interesting to other people, and encouraging to those with similarly messy lives. So I wrote a more detailed version for others to read, and this is the last sentence of it.
|
||||
|
||||
|
||||
---
|
||||
**Notes**
|
||||
|
||||
[1] My experience skipped a step in the evolution of computers: time-sharing machines with interactive OSes. I went straight from batch processing to microcomputers, which made microcomputers seem all the more exciting.
|
||||
|
||||
[2] Italian words for abstract concepts can nearly always be predicted from their English cognates (except for occasional traps like _polluzione_). It's the everyday words that differ. So if you string together a lot of abstract concepts with a few simple verbs, you can make a little Italian go a long way.
|
||||
|
||||
[3] I lived at Piazza San Felice 4, so my walk to the Accademia went straight down the spine of old Florence: past the Pitti, across the bridge, past Orsanmichele, between the Duomo and the Baptistery, and then up Via Ricasoli to Piazza San Marco. I saw Florence at street level in every possible condition, from empty dark winter evenings to sweltering summer days when the streets were packed with tourists.
|
||||
|
||||
[4] You can of course paint people like still lives if you want to, and they're willing. That sort of portrait is arguably the apex of still life painting, though the long sitting does tend to produce pained expressions in the sitters.
|
||||
|
||||
[5] Interleaf was one of many companies that had smart people and built impressive technology, and yet got crushed by Moore's Law. In the 1990s the exponential growth in the power of commodity (i.e. Intel) processors rolled up high-end, special-purpose hardware and software companies like a bulldozer.
|
||||
|
||||
[6] The signature style seekers at RISD weren't specifically mercenary. In the art world, money and coolness are tightly coupled. Anything expensive comes to be seen as cool, and anything seen as cool will soon become equally expensive.
|
||||
|
||||
[7] Technically the apartment wasn't rent-controlled but rent-stabilized, but this is a refinement only New Yorkers would know or care about. The point is that it was really cheap, less than half market price.
|
||||
|
||||
[8] Most software you can launch as soon as it's done. But when the software is an online store builder and you're hosting the stores, if you don't have any users yet, that fact will be painfully obvious. So before we could launch publicly we had to launch privately, in the sense of recruiting an initial set of users and making sure they had decent-looking stores.
|
||||
|
||||
[9] We'd had a code editor in Viaweb for users to define their own page styles. They didn't know it, but they were editing Lisp expressions underneath. But this wasn't an app editor, because the code ran when the merchants' sites were generated, not when shoppers visited them.
|
||||
|
||||
[10] This was the first instance of what is now a familiar experience, and so was what happened next, when I read the comments and found they were full of angry people. How could I claim that Lisp was better than other languages? Weren't they all Turing complete? People who see the responses to essays I write sometimes tell me how sorry they feel for me, but I'm not exaggerating when I reply that it has always been like this, since the very beginning. It comes with the territory. An essay must tell readers things they [_don't already know_](useful.html), and some people dislike being told such things.
|
||||
|
||||
[11] People put plenty of stuff on the internet in the 90s of course, but putting something online is not the same as publishing it online. Publishing online means you treat the online version as the (or at least a) primary version.
|
||||
|
||||
[12] There is a general lesson here that our experience with Y Combinator also teaches: Customs continue to constrain you long after the restrictions that caused them have disappeared. Customary VC practice had once, like the customs about publishing essays, been based on real constraints. Startups had once been much more expensive to start, and proportionally rare. Now they could be cheap and common, but the VCs' customs still reflected the old world, just as customs about writing essays still reflected the constraints of the print era.
|
||||
|
||||
Which in turn implies that people who are independent-minded (i.e. less influenced by custom) will have an advantage in fields affected by rapid change (where customs are more likely to be obsolete).
|
||||
|
||||
Here's an interesting point, though: you can't always predict which fields will be affected by rapid change. Obviously software and venture capital will be, but who would have predicted that essay writing would be?
|
||||
|
||||
[13] Y Combinator was not the original name. At first we were called Cambridge Seed. But we didn't want a regional name, in case someone copied us in Silicon Valley, so we renamed ourselves after one of the coolest tricks in the lambda calculus, the Y combinator.
|
||||
|
||||
I picked orange as our color partly because it's the warmest, and partly because no VC used it. In 2005 all the VCs used staid colors like maroon, navy blue, and forest green, because they were trying to appeal to LPs, not founders. The YC logo itself is an inside joke: the Viaweb logo had been a white V on a red circle, so I made the YC logo a white Y on an orange square.
|
||||
|
||||
[14] YC did become a fund for a couple years starting in 2009, because it was getting so big I could no longer afford to fund it personally. But after Heroku got bought we had enough money to go back to being self-funded.
|
||||
|
||||
[15] I've never liked the term "deal flow," because it implies that the number of new startups at any given time is fixed. This is not only false, but it's the purpose of YC to falsify it, by causing startups to be founded that would not otherwise have existed.
|
||||
|
||||
[16] She reports that they were all different shapes and sizes, because there was a run on air conditioners and she had to get whatever she could, but that they were all heavier than she could carry now.
|
||||
|
||||
[17] Another problem with HN was a bizarre edge case that occurs when you both write essays and run a forum. When you run a forum, you're assumed to see if not every conversation, at least every conversation involving you. And when you write essays, people post highly imaginative misinterpretations of them on forums. Individually these two phenomena are tedious but bearable, but the combination is disastrous. You actually have to respond to the misinterpretations, because the assumption that you're present in the conversation means that not responding to any sufficiently upvoted misinterpretation reads as a tacit admission that it's correct. But that in turn encourages more; anyone who wants to pick a fight with you senses that now is their chance.
|
||||
|
||||
[18] The worst thing about leaving YC was not working with Jessica anymore. We'd been working on YC almost the whole time we'd known each other, and we'd neither tried nor wanted to separate it from our personal lives, so leaving was like pulling up a deeply rooted tree.
|
||||
|
||||
[19] One way to get more precise about the concept of invented vs discovered is to talk about space aliens. Any sufficiently advanced alien civilization would certainly know about the Pythagorean theorem, for example. I believe, though with less certainty, that they would also know about the Lisp in McCarthy's 1960 paper.
|
||||
|
||||
But if so there's no reason to suppose that this is the limit of the language that might be known to them. Presumably aliens need numbers and errors and I/O too. So it seems likely there exists at least one path out of McCarthy's Lisp along which discoveredness is preserved.
|
||||
|
||||
|
||||
---
|
||||
**Thanks** to Trevor Blackwell, John Collison, Patrick Collison, Daniel Gackle, Ralph Hazell, Jessica Livingston, Robert Morris, and Harj Taggar for reading drafts of this.
|
||||
* * *
|
||||
19
tests/data/markdown/why_yc.md
Normal file
19
tests/data/markdown/why_yc.md
Normal file
@@ -0,0 +1,19 @@
|
||||
# 057 Why YC
|
||||
|
||||
|
||||
|
||||
March 2006, rev August 2009
|
||||
|
||||
Yesterday one of the founders we funded asked me why we started [Y Combinator](http://ycombinator.com). Or more precisely, he asked if we'd started YC mainly for fun.
|
||||
|
||||
Kind of, but not quite. It is enormously fun to be able to work with Rtm and Trevor again. I missed that after we sold Viaweb, and for all the years after I always had a background process running, looking for something we could do together. There is definitely an aspect of a band reunion to Y Combinator. Every couple days I slip and call it "Viaweb."
|
||||
|
||||
Viaweb we started very explicitly to make money. I was sick of living from one freelance project to the next, and decided to just work as hard as I could till I'd made enough to solve the problem once and for all. Viaweb was sometimes fun, but it wasn't designed for fun, and mostly it wasn't. I'd be surprised if any startup is. All startups are mostly schleps.
|
||||
|
||||
The real reason we started Y Combinator is neither selfish nor virtuous. We didn't start it mainly to make money; we have no idea what our average returns might be, and won't know for years. Nor did we start YC mainly to help out young would-be founders, though we do like the idea, and comfort ourselves occasionally with the thought that if all our investments tank, we will thus have been doing something unselfish. (It's oddly nondeterministic.)
|
||||
|
||||
The real reason we started Y Combinator is one probably only a [hacker](gba.html) would understand. We did it because it seems such a great hack. There are thousands of smart people who could start companies and don't, and with a relatively small amount of force applied at just the right place, we can spring on the world a stream of new startups that might otherwise not have existed.
|
||||
|
||||
In a way this is virtuous, because I think startups are a good thing. But really what motivates us is the completely amoral desire that would motivate any hacker who looked at some complex device and realized that with a tiny tweak he could make it run more efficiently. In this case, the device is the world's economy, which fortunately happens to be open source.
|
||||
|
||||
* * *
|
||||
430
tests/test_chat_actors.py
Normal file
430
tests/test_chat_actors.py
Normal file
@@ -0,0 +1,430 @@
|
||||
# Standard Packages
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
# External Packages
|
||||
import pytest
|
||||
from freezegun import freeze_time
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.conversation.gpt import converse, extract_questions
|
||||
from khoj.processor.conversation.utils import message_to_log
|
||||
|
||||
|
||||
# Initialize variables for tests
|
||||
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None:
    # Without an API key, skip this whole module instead of failing each test
    pytest.skip(
        allow_module_level=True,
        reason="Set OPENAI_API_KEY environment variable to run tests below. Get OpenAI API key from https://platform.openai.com/account/api-keys",
    )
|
||||
|
||||
|
||||
# Test
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
@freeze_time("1984-04-02")
def test_extract_question_with_date_filter_from_relative_day():
    """With today frozen at 1984-04-02, "yesterday" should yield a date filter for 1984-04-01."""
    # Arrange
    # Each pair lists the filter fragments that must both appear for one acceptable encoding
    acceptable_filters = [
        ('dt="1984-04-01"', ""),
        ('dt>="1984-04-01"', 'dt<"1984-04-02"'),
        ('dt>"1984-03-31"', 'dt<"1984-04-02"'),
    ]

    # Act
    response = extract_questions("Where did I go for dinner yesterday?")

    # Assert
    assert len(response) == 1
    filter_found = any(lower in response[0] and upper in response[0] for lower, upper in acceptable_filters)
    assert filter_found, "Expected date filter to limit to 1st April 1984 in response but got: " + response[0]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
@freeze_time("1984-04-02")
def test_extract_question_with_date_filter_from_relative_month():
    """With today frozen at 1984-04-02, "last month" should yield a date filter spanning March 1984."""
    # Arrange
    # Each pair lists the filter fragments that must both appear for one acceptable encoding
    acceptable_filters = [
        ('dt>="1984-03-01"', 'dt<"1984-04-01"'),
        ('dt>="1984-03-01"', 'dt<="1984-03-31"'),
    ]

    # Act
    response = extract_questions("Which countries did I visit last month?")

    # Assert
    assert len(response) == 1
    filter_found = any(lower in response[0] and upper in response[0] for lower, upper in acceptable_filters)
    assert filter_found, "Expected date filter to limit to March 1984 in response but got: " + response[0]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
@freeze_time("1984-04-02")
def test_extract_question_with_date_filter_from_relative_year():
    """With today frozen at 1984-04-02, "this year" should yield a date filter starting 1984-01-01."""
    # Arrange
    # Each pair lists the filter fragments that must both appear; an open-ended upper bound is accepted
    acceptable_filters = [
        ('dt>="1984-01-01"', ""),
        ('dt>="1984-01-01"', 'dt<"1985-01-01"'),
        ('dt>="1984-01-01"', 'dt<="1984-12-31"'),
    ]

    # Act
    response = extract_questions("Which countries have I visited this year?")

    # Assert
    assert len(response) == 1
    filter_found = any(lower in response[0] and upper in response[0] for lower, upper in acceptable_filters)
    assert filter_found, "Expected date filter to limit to 1984 in response but got: " + response[0]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_extract_multiple_explicit_questions_from_message():
    """A message containing two explicit questions should yield two search queries, one per topic."""
    # Arrange
    # (topic expected in first query, topic expected in second query), compared case-insensitively
    expected_topics = [("sun", "moon")]

    # Act
    response = extract_questions("What is the Sun? What is the Moon?")

    # Assert
    assert len(response) == 2
    topics_found = any(
        first in response[0].lower() and second in response[1].lower() for first, second in expected_topics
    )
    assert topics_found, "Expected two search queries in response but got: " + response[0]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_extract_multiple_implicit_questions_from_message():
    """A comparison question should be split into two search queries, one per subject being compared."""
    # Arrange
    # (subject expected in first query, subject expected in second query), compared case-insensitively
    expected_subjects = [("morpheus", "neo")]

    # Act
    response = extract_questions("Is Morpheus taller than Neo?")

    # Assert
    assert len(response) == 2
    subjects_found = any(
        first in response[0].lower() and second in response[1].lower() for first, second in expected_subjects
    )
    assert subjects_found, "Expected two search queries in response but got: " + response[0]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_generate_search_query_using_question_from_chat_history():
    """The search query should name Vader, who appears only in the earlier question of the chat history."""
    # Arrange
    chat_history = populate_chat_history(
        [
            ("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
        ]
    )

    # Act
    response = extract_questions("Does he have any sons?", conversation_log=chat_history)

    # Assert
    assert len(response) == 1
    assert "Vader" in response[0]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_generate_search_query_using_answer_from_chat_history():
    """The search query should name Leia, who appears only in the earlier answer of the chat history."""
    # Arrange
    chat_history = populate_chat_history(
        [
            ("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
        ]
    )

    # Act
    response = extract_questions("Is she a Jedi?", conversation_log=chat_history)

    # Assert
    assert len(response) == 1
    assert "Leia" in response[0]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_generate_search_query_using_question_and_answer_from_chat_history():
    """The search query should name both Luke (from the earlier question) and Leia (from its answer)."""
    # Arrange
    chat_history = populate_chat_history(
        [
            ("Does Luke Skywalker have any Siblings?", "Yes, Princess Leia", []),
        ]
    )

    # Act
    response = extract_questions("Who is their father?", conversation_log=chat_history)

    # Assert
    assert len(response) == 1
    query = response[0]
    assert "Leia" in query and "Luke" in query
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_generate_search_query_with_date_and_context_from_chat_history():
    """The search query should carry over both the place and the date range from the chat history.

    The follow-up question only says "over there", so the query has to pick up
    "Masai Mara" and an April 2000 date filter from the previous exchange.
    """
    # Arrange
    message_list = [
        ("When did I visit Masai Mara?", "You visited Masai Mara in April 2000", []),
    ]
    # Each pair lists the filter fragments that must both appear for one acceptable encoding.
    # April has 30 days, so the inclusive upper bound is 2000-04-30.
    expected_responses = [
        ('dt>="2000-04-01"', 'dt<"2000-05-01"'),
        ('dt>="2000-04-01"', 'dt<="2000-04-30"'),
    ]

    # Act
    response = extract_questions(
        "What was the Pizza place we ate at over there?", conversation_log=populate_chat_history(message_list)
    )

    # Assert
    assert len(response) == 1
    assert "Masai Mara" in response[0]
    assert any(start in response[0] and end in response[0] for start, end in expected_responses), (
        "Expected date filter to limit to April 2000 in response but got: " + response[0]
    )
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_chat_with_no_chat_history_or_retrieved_content():
    """With neither chat history nor notes, the chat actor should still introduce itself as Khoj."""
    # Act
    response = converse(
        references=[],  # Assume no context retrieved from notes for the user_query
        user_query="Hello, my name is Testatron. Who are you?",
        api_key=api_key,
    )

    # Assert
    assert len(response) > 0
    name_mentioned = "Khoj" in response or "khoj" in response
    assert name_mentioned, "Expected assistants name, [K|k]hoj, in response but got: " + response
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_answer_from_chat_history_and_no_content():
    """The chat actor should recall the user's name from chat history when no notes are retrieved."""
    # Arrange
    chat_history = populate_chat_history(
        [
            ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
            ("When was I born?", "You were born on 1st April 1984.", []),
        ]
    )

    # Act
    response = converse(
        references=[],  # Assume no context retrieved from notes for the user_query
        user_query="What is my name?",
        conversation_log=chat_history,
        api_key=api_key,
    )

    # Assert
    assert len(response) > 0
    name_mentioned = "Testatron" in response or "testatron" in response
    assert name_mentioned, "Expected [T|t]estatron in response but got: " + response
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_answer_from_chat_history_and_previously_retrieved_content():
    """Chat actor should answer using notes attached to an earlier turn together with the chat history."""
    # Arrange
    introduction = (
        "Hello, my name is Testatron. Who are you?",
        "Hi, I am Khoj, a personal assistant. How can I help?",
        [],
    )
    # This earlier turn carries the note that mentions the birthplace
    birth_date_turn = (
        "When was I born?",
        "You were born on 1st April 1984.",
        ["Testatron was born on 1st April 1984 in Testville."],
    )
    chat_history = populate_chat_history([introduction, birth_date_turn])

    # Act
    response = converse(
        references=[],  # Assume no context retrieved from notes for the user_query
        user_query="Where was I born?",
        conversation_log=chat_history,
        api_key=api_key,
    )

    # Assert
    assert len(response) > 0
    # Infer who I am and use that to infer I was born in Testville using chat history and previously retrieved notes
    assert "Testville" in response
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_answer_from_chat_history_and_currently_retrieved_content():
    """Chat actor should combine the notes retrieved for this query with the chat history to answer."""
    # Arrange
    chat_history = populate_chat_history(
        [
            ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
            ("When was I born?", "You were born on 1st April 1984.", []),
        ]
    )
    # Assume context retrieved from notes for the user_query
    retrieved_notes = ["Testatron was born on 1st April 1984 in Testville."]

    # Act
    response = converse(
        references=retrieved_notes,
        user_query="Where was I born?",
        conversation_log=chat_history,
        api_key=api_key,
    )

    # Assert
    assert len(response) > 0
    assert "Testville" in response
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_no_answer_in_chat_history_or_retrieved_content():
    """Chat actor should admit it does not know when neither chat history nor notes hold the answer."""
    # Arrange
    chat_history = populate_chat_history(
        [
            ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
            ("When was I born?", "You were born on 1st April 1984.", []),
        ]
    )
    # Phrases accepted as the chat actor saying it lacks the information
    unknown_phrases = ["don't know", "do not know", "no information", "do not have", "don't have"]

    # Act
    response = converse(
        references=[],  # Assume no context retrieved from notes for the user_query
        user_query="Where was I born?",
        conversation_log=chat_history,
        api_key=api_key,
    )

    # Assert
    assert len(response) > 0
    admits_unknown = any(phrase in response for phrase in unknown_phrases)
    assert admits_unknown, "Expected chat actor to say they don't know in response, but got: " + response
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_answer_requires_current_date_awareness():
    """Chat actor should resolve "today" against the current date to pick the right note."""
    # Arrange
    today = datetime.now().strftime("%Y-%m-%d")
    # Two entries dated today and two dated in 2020
    context = [
        f'{today} "Naco Taco" "Tacos for Dinner"\nExpenses:Food:Dining 10.00 USD',
        f'{today} "Sagar Ratna" "Dosa for Lunch"\nExpenses:Food:Dining 10.00 USD',
        '2020-04-01 "SuperMercado" "Bananas"\nExpenses:Food:Groceries 10.00 USD',
        '2020-01-01 "Naco Taco" "Burittos for Dinner"\nExpenses:Food:Dining 10.00 USD',
    ]

    # Act
    response = converse(
        references=context,  # Assume context retrieved from notes for the user_query
        user_query="What did I have for Dinner today?",
        api_key=api_key,
    )

    # Assert
    assert len(response) > 0
    tacos_mentioned = "tacos" in response or "Tacos" in response
    assert tacos_mentioned, "Expected [T|t]acos in response, but got: " + response
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_answer_requires_date_aware_aggregation_across_provided_notes():
    """Chat actor should total the dining entries that fall in the current year across several notes."""
    # Arrange
    today = datetime.now().strftime("%Y-%m-%d")
    # Two dining entries of 10.00 USD are dated today; the remaining entries are dated in 2020
    context = [
        f'# {today} "Naco Taco" "Tacos for Dinner"\nExpenses:Food:Dining 10.00 USD',
        f'{today} "Sagar Ratna" "Dosa for Lunch"\nExpenses:Food:Dining 10.00 USD',
        '2020-04-01 "SuperMercado" "Bananas"\nExpenses:Food:Groceries 10.00 USD',
        '2020-01-01 "Naco Taco" "Burittos for Dinner"\nExpenses:Food:Dining 10.00 USD',
    ]

    # Act
    response = converse(
        references=context,  # Assume context retrieved from notes for the user_query
        user_query="How much did I spend on dining this year?",
        api_key=api_key,
    )

    # Assert
    assert len(response) > 0
    assert "20" in response
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_answer_general_question_not_in_chat_history_or_retrieved_content():
    """Chat actor should handle a general request that needs neither the chat history nor any notes."""
    # Arrange
    chat_history = populate_chat_history(
        [
            ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
            ("When was I born?", "You were born on 1st April 1984.", []),
            ("Where was I born?", "You were born Testville.", []),
        ]
    )

    # Act
    response = converse(
        references=[],  # Assume no context retrieved from notes for the user_query
        user_query="Write a haiku about unit testing in 3 lines",
        conversation_log=chat_history,
        api_key=api_key,
    )

    # Assert
    line_count = len(response.splitlines())
    assert line_count == 3  # haikus are 3 lines long
    test_mentioned = "test" in response or "Test" in response
    assert test_mentioned, "Expected [T|t]est in response, but got: " + response
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.xfail(reason="Chat actor not consistently capable of asking for clarification yet.")
@pytest.mark.chatquality
def test_ask_for_clarification_if_not_enough_context_in_question():
    """Chat actor should ask which sister is meant when the notes cannot settle the question."""
    # Arrange
    # Notes describe three sisters; none of them says which one is older
    context = [
        "# Ramya\nMy sister, Ramya, is married to Kali Devi. They have 2 kids, Ravi and Rani.",
        "# Fang\nMy sister, Fang Liu is married to Xi Li. They have 1 kid, Xiao Li.",
        "# Aiyla\nMy sister, Aiyla is married to Tolga. They have 3 kids, Yildiz, Ali and Ahmet.",
    ]
    clarifying_phrases = ["which sister", "Which sister", "which of your sister", "Which of your sister"]

    # Act
    response = converse(
        references=context,  # Assume context retrieved from notes for the user_query
        user_query="How many kids does my older sister have?",
        api_key=api_key,
    )

    # Assert
    asks_to_clarify = any(phrase in response for phrase in clarifying_phrases)
    assert asks_to_clarify, "Expected chat actor to ask for clarification in response, but got: " + response
|
||||
|
||||
|
||||
# Helpers
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def populate_chat_history(message_list):
    """Build a conversation log from (user_message, gpt_message, context) triples.

    Each triple is passed through message_to_log and the results are
    accumulated, in order, under the "chat" key of the returned dict.
    """
    chat_entries = []
    for user_message, gpt_message, context in message_list:
        # The user's message serves as both the query and its only inferred query
        intent = {"query": user_message, "inferred-queries": f'["{user_message}"]'}
        chat_entries += message_to_log(user_message, gpt_message, {"context": context, "intent": intent})
    return {"chat": chat_entries}
|
||||
285
tests/test_chat_director.py
Normal file
285
tests/test_chat_director.py
Normal file
@@ -0,0 +1,285 @@
|
||||
# Standard Packages
|
||||
import os
|
||||
|
||||
# External Packages
|
||||
import pytest
|
||||
from freezegun import freeze_time
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.conversation.utils import message_to_log
|
||||
from khoj.utils import state
|
||||
|
||||
|
||||
# Initialize variables for tests
|
||||
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None:
    # Without a key none of the tests below can run, so skip the whole module
    pytest.skip(
        allow_module_level=True,
        reason="Set OPENAI_API_KEY environment variable to run tests below. Get OpenAI API key from https://platform.openai.com/account/api-keys",
    )
|
||||
|
||||
|
||||
# Helpers
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def populate_chat_history(message_list):
    """Seed the application's conversation meta log from (user_message, gpt_message, context) triples.

    Each triple is passed through message_to_log; the accumulated entries are
    stored under the "chat" key of state.processor_config.conversation.meta_log.
    Nothing is returned.
    """
    chat_entries = []
    for user_message, gpt_message, context in message_list:
        # The user's message serves as both the query and its only inferred query
        intent = {"query": user_message, "inferred-queries": f'["{user_message}"]'}
        chat_entries += message_to_log(user_message, gpt_message, {"context": context, "intent": intent})

    # Publish the generated log to application state
    state.processor_config.conversation.meta_log = {"chat": chat_entries}
|
||||
|
||||
|
||||
# Tests
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_chat_with_no_chat_history_or_retrieved_content(chat_client):
    """The chat API reply to a self-introduction should mention the assistant's name."""
    # Act
    response = chat_client.get('/api/chat?q="Hello, my name is Testatron. Who are you?"')
    response_message = response.json()["response"]

    # Assert
    assert response.status_code == 200
    names_assistant = any(name in response_message for name in ("Khoj", "khoj"))
    assert names_assistant, "Expected assistants name, [K|k]hoj, in response but got: " + response_message
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_answer_from_chat_history(chat_client):
    """The chat API should recall the user's name from the seeded chat history."""
    # Arrange: seed history in which the user introduces themselves as Testatron
    populate_chat_history(
        [
            ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
            ("When was I born?", "You were born on 1st April 1984.", []),
        ]
    )

    # Act
    response = chat_client.get('/api/chat?q="What is my name?"')
    response_message = response.json()["response"]

    # Assert
    assert response.status_code == 200
    recalls_name = any(name in response_message for name in ("Testatron", "testatron"))
    assert recalls_name, "Expected [T|t]estatron in response but got: " + response_message
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_answer_from_currently_retrieved_content(chat_client):
    """The chat API should answer a question about Xi Li that the seeded chat history cannot answer.

    NOTE(review): the expected answer presumably comes from notes made
    searchable by the chat_client fixture - confirm against conftest.
    """
    # Arrange: seeded history only concerns Testatron, not Xi Li
    born_in_testville_turn = (
        "When was I born?",
        "You were born on 1st April 1984.",
        ["Testatron was born on 1st April 1984 in Testville."],
    )
    populate_chat_history(
        [
            ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
            born_in_testville_turn,
        ]
    )

    # Act
    response = chat_client.get('/api/chat?q="Where was Xi Li born?"')
    response_message = response.json()["response"]

    # Assert
    assert response.status_code == 200
    assert "Fujiang" in response_message
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_answer_from_chat_history_and_previously_retrieved_content(chat_client):
    """The chat API should combine chat history with context attached to an earlier turn."""
    # Arrange: the second turn carries a note naming the user's birthplace
    born_in_testville_turn = (
        "When was I born?",
        "You were born on 1st April 1984.",
        ["Testatron was born on 1st April 1984 in Testville."],
    )
    populate_chat_history(
        [
            ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
            born_in_testville_turn,
        ]
    )

    # Act
    response = chat_client.get('/api/chat?q="Where was I born?"')
    response_message = response.json()["response"]

    # Assert
    assert response.status_code == 200
    # Answering requires two inferences:
    # 1. Who "I" am, from the chat history
    # 2. That I was born in Testville, from the previously retrieved notes
    assert "Testville" in response_message
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
# `raises=` restricts the expected failure to AssertionError. Passed positionally,
# the exception class would be treated as xfail's (always truthy) condition instead.
@pytest.mark.xfail(raises=AssertionError, reason="Chat director not capable of answering this question yet")
@pytest.mark.chatquality
def test_answer_from_chat_history_and_currently_retrieved_content(chat_client):
    """The chat API should combine the user's identity from chat history with newly retrieved notes."""
    # Arrange: the user introduces themselves as Xi Li; no context is attached to any turn
    message_list = [
        ("Hello, my name is Xi Li. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
        ("When was I born?", "You were born on 1st April 1984.", []),
    ]
    populate_chat_history(message_list)

    # Act
    response = chat_client.get('/api/chat?q="Where was I born?"')
    response_message = response.json()["response"]

    # Assert
    assert response.status_code == 200
    # Inference in a multi-turn conversation
    # 1. Infer who I am from chat history
    # 2. Search for notes about when <my_name_from_chat_history> was born
    # 3. Extract where I was born from currently retrieved notes
    assert "Fujiang" in response_message
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_no_answer_in_chat_history_or_retrieved_content(chat_client):
    """Chat director should say it doesn't know when neither chat history nor retrieved content has the answer."""
    # Arrange: history states when, but not where, the user was born
    message_list = [
        ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
        ("When was I born?", "You were born on 1st April 1984.", []),
    ]
    populate_chat_history(message_list)

    # Act
    response = chat_client.get('/api/chat?q="Where was I born?"')
    response_message = response.json()["response"]

    # Assert
    expected_responses = ["don't know", "do not know", "no information", "do not have", "don't have"]
    assert response.status_code == 200
    # The failure message must use the decoded text: concatenating the response
    # object itself would raise TypeError and hide the real assertion failure.
    assert any(expected_response in response_message for expected_response in expected_responses), (
        "Expected chat director to say they don't know in response, but got: " + response_message
    )
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
# `raises=` restricts the expected failure to AssertionError. Passed positionally,
# the exception class would be treated as xfail's (always truthy) condition instead.
@pytest.mark.xfail(raises=AssertionError, reason="Chat director not capable of answering time aware questions yet")
@pytest.mark.chatquality
@freeze_time("2023-04-01")
def test_answer_requires_current_date_awareness(chat_client):
    """Chat director should be able to answer questions relative to the current date using provided notes."""
    # Act: the clock is frozen at 2023-04-01, so "today" refers to that date
    response = chat_client.get('/api/chat?q="Where did I have lunch today?"')
    response_message = response.json()["response"]

    # Assert
    expected_responses = ["Arak", "Medellin"]
    assert response.status_code == 200
    assert any(expected_response in response_message for expected_response in expected_responses), (
        "Expected chat director to say Arak, Medellin, but got: " + response_message
    )
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
@freeze_time("2023-04-01")
def test_answer_requires_date_aware_aggregation_across_provided_notes(chat_client):
    """Chat director should answer questions needing date-aware aggregation across multiple notes."""
    # Act: the clock is frozen at 2023-04-01, so "this year" means 2023
    reply = chat_client.get('/api/chat?q="How much did I spend on dining this year?"')
    reply_text = reply.json()["response"]

    # Assert
    assert reply.status_code == 200
    assert "23" in reply_text
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_answer_general_question_not_in_chat_history_or_retrieved_content(chat_client):
    """Chat director should answer a general request that needs neither chat history nor notes."""
    # Arrange: prior turns that are unrelated to the upcoming request
    message_list = [
        ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
        ("When was I born?", "You were born on 1st April 1984.", []),
        ("Where was I born?", "You were born Testville.", []),
    ]
    populate_chat_history(message_list)

    # Act: the query is wrapped in exactly one pair of double quotes, like the other tests
    response = chat_client.get('/api/chat?q="Write a haiku about unit testing"')
    response_message = response.json()["response"]

    # Assert
    expected_responses = ["test", "Test"]
    assert response.status_code == 200
    assert len(response_message.splitlines()) == 3  # haikus are 3 lines long
    assert any(expected_response in response_message for expected_response in expected_responses), (
        "Expected [T|t]est in response, but got: " + response_message
    )
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.xfail(reason="Chat director not consistently capable of asking for clarification yet.")
@pytest.mark.chatquality
def test_ask_for_clarification_if_not_enough_context_in_question(chat_client):
    """The chat API should ask which son is older rather than guess."""
    # Act
    reply = chat_client.get('/api/chat?q="What is the name of Namitas older son"')
    reply_text = reply.json()["response"]

    # Assert: match case-insensitively against the accepted clarification phrasings
    clarifying_phrases = (
        "which of them is the older",
        "which one is older",
        "which of them is older",
        "which one is the older",
    )
    assert reply.status_code == 200
    asks_for_clarification = any(phrase in reply_text.lower() for phrase in clarifying_phrases)
    assert asks_for_clarification, (
        "Expected chat director to ask for clarification in response, but got: " + reply_text
    )
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.xfail(reason="Chat director not capable of answering this question yet")
@pytest.mark.chatquality
def test_answer_in_chat_history_beyond_lookback_window(chat_client):
    """The chat API should recall the user's name from the earliest of three seeded turns."""
    # Arrange: the name is only mentioned in the first of three turns
    message_list = [
        ("Hello, my name is Testatron. Who are you?", "Hi, I am Khoj, a personal assistant. How can I help?", []),
        ("When was I born?", "You were born on 1st April 1984.", []),
        ("Where was I born?", "You were born Testville.", []),
    ]
    populate_chat_history(message_list)

    # Act
    response = chat_client.get('/api/chat?q="What is my name?"')
    response_message = response.json()["response"]

    # Assert
    assert response.status_code == 200
    # The message is lowercased before matching, so only the lowercase spelling
    # can ever match; a capitalised expectation would be dead.
    assert "testatron" in response_message.lower(), (
        "Expected [T|t]estatron in response, but got: " + response_message
    )
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.chatquality
def test_answer_requires_multiple_independent_searches(chat_client):
    """Chat director should answer by running multiple independent searches for the required information."""
    # Act
    reply = chat_client.get('/api/chat?q="Is Xi older than Namita?"')
    reply_text = reply.json()["response"]

    # Assert: match case-insensitively against the accepted phrasings
    accepted_phrasings = ("he is older than namita", "xi is older than namita")
    assert reply.status_code == 200
    says_xi_is_older = any(phrasing in reply_text.lower() for phrasing in accepted_phrasings)
    assert says_xi_is_older, "Expected Xi is older than Namita, but got: " + reply_text
|
||||
@@ -1,81 +0,0 @@
|
||||
# External Packages
|
||||
import pytest
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.conversation.gpt import converse, understand, message_to_prompt
|
||||
|
||||
|
||||
# Initialize variables for tests
|
||||
model = "text-davinci-003"
|
||||
api_key = None # Input your OpenAI API key to run the tests below
|
||||
|
||||
|
||||
# Test
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_message_to_understand_prompt():
    """message_to_prompt should append the new message as a Q/A turn after the primer."""
    # Arrange
    user_message = "When did I last dine at Burger King?"
    understand_primer = 'Extract information from each chat message\n\nremember(memory-type, data);\nmemory-type=["companion", "notes", "ledger", "image", "music"]\nsearch(search-type, data);\nsearch-type=["google", "youtube"]\ngenerate(activity);\nactivity=["paint","write", "chat"]\ntrigger-emotion(emotion);\nemotion=["happy","confidence","fear","surprise","sadness","disgust","anger", "curiosity", "calm"]\n\nQ: How are you doing?\nA: activity("chat"); trigger-emotion("surprise")\nQ: Do you remember what I told you about my brother Antoine when we were at the beach?\nA: remember("notes", "Brother Antoine when we were at the beach"); trigger-emotion("curiosity");\nQ: what did we talk about last time?\nA: remember("notes", "talk last time"); trigger-emotion("curiosity");\nQ: Let\'s make some drawings!\nA: generate("paint"); trigger-emotion("happy");\nQ: Do you know anything about Lebanon?\nA: search("google", "lebanon"); trigger-emotion("confidence");\nQ: Find a video about a panda rolling in the grass\nA: search("youtube","panda rolling in the grass"); trigger-emotion("happy"); \nQ: Tell me a scary story\nA: generate("write" "A story about some adventure"); trigger-emotion("fear");\nQ: What fiction book was I reading last week about AI starship?\nA: remember("notes", "read fiction book about AI starship last week"); trigger-emotion("curiosity");\nQ: How much did I spend at Subway for dinner last time?\nA: remember("ledger", "last Subway dinner"); trigger-emotion("curiosity");\nQ: I\'m feeling sleepy\nA: activity("chat"); trigger-emotion("calm")\nQ: What was that popular Sri lankan song that Alex showed me recently?\nA: remember("music", "popular Sri lankan song that Alex showed recently"); trigger-emotion("curiosity"); \nQ: You\'re pretty funny!\nA: activity("chat"); trigger-emotion("pride")'
    # The expected prompt is the primer followed by the new question and an open
    # answer slot. Deriving it keeps the expectation in sync with the primer
    # instead of duplicating the whole literal.
    expected_response = understand_primer + "\nQ: " + user_message + "\nA:"

    # Act
    actual_response = message_to_prompt(
        user_message, understand_primer, start_sequence="\nA:", restart_sequence="\nQ:"
    )

    # Assert
    assert actual_response == expected_response
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.skipif(
    api_key is None,
    reason="Set api_key variable to your OpenAI API key from https://beta.openai.com/account/api-keys",
)
def test_minimal_chat_with_gpt():
    """converse should return a non-empty reply to a standalone question."""
    # Act
    reply = converse("What will happen when the stars go out?", model=model, api_key=api_key)

    # Assert
    assert len(reply) > 0
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.skipif(
    api_key is None,
    reason="Set api_key variable to your OpenAI API key from https://beta.openai.com/account/api-keys",
)
def test_chat_with_history():
    """converse should recall the user's name from the supplied conversation history."""
    # Arrange: a primer in which the user introduces themselves as Testatron
    ai_prompt = "AI:"
    human_prompt = "Human:"
    conversation_primer = f"""
The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly companion.

{human_prompt} Hello, I am Testatron. Who are you?
{ai_prompt} Hi, I am Khoj, an AI conversational companion created by OpenAI. How can I help you today?"""

    # Act
    reply = converse(
        "Hi Khoj, What is my name?",
        model=model,
        conversation_history=conversation_primer,
        api_key=api_key,
        temperature=0,
        max_tokens=50,
    )

    # Assert
    assert len(reply) > 0
    assert any(name in reply for name in ("Testatron", "testatron"))
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.skipif(
    api_key is None,
    reason="Set api_key variable to your OpenAI API key from https://beta.openai.com/account/api-keys",
)
def test_understand_message_using_gpt():
    """understand should classify a question about a past meal as a ledger memory lookup."""
    # Act
    understanding = understand("When did I last dine at Subway?", model=model, api_key=api_key)

    # Assert
    assert len(understanding) > 0
    memory_type = understanding["intent"]["memory-type"]
    assert memory_type == "ledger"
|
||||
@@ -43,8 +43,21 @@ def test_cli_config_from_file():
|
||||
assert actual_args.no_gui == True
|
||||
assert actual_args.regenerate == True
|
||||
assert actual_args.config is not None
|
||||
assert actual_args.verbose == 3
|
||||
|
||||
# Ensure content config is loaded from file
|
||||
assert actual_args.config.content_type.org.input_files == [
|
||||
Path("~/first_from_config.org"),
|
||||
Path("~/second_from_config.org"),
|
||||
]
|
||||
assert actual_args.verbose == 3
|
||||
assert len(actual_args.config.content_type.plugins.keys()) == 2
|
||||
assert actual_args.config.content_type.plugins["content_plugin_1"].input_files == [
|
||||
Path("content_plugin_1_new.jsonl.gz")
|
||||
]
|
||||
assert actual_args.config.content_type.plugins["content_plugin_2"].input_filter == ["*2_new.jsonl.gz"]
|
||||
assert actual_args.config.content_type.plugins["content_plugin_1"].compressed_jsonl == Path(
|
||||
"content_plugin_1.jsonl.gz"
|
||||
)
|
||||
assert actual_args.config.content_type.plugins["content_plugin_2"].embeddings_file == Path(
|
||||
"content_plugin_2_embeddings.pt"
|
||||
)
|
||||
|
||||
@@ -9,6 +9,8 @@ from fastapi.testclient import TestClient
|
||||
|
||||
# Internal Packages
|
||||
from khoj.main import app
|
||||
from khoj.configure import configure_routes, configure_search_types
|
||||
from khoj.utils import state
|
||||
from khoj.utils.state import model, config
|
||||
from khoj.search_type import text_search, image_search
|
||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
||||
@@ -17,14 +19,9 @@ from khoj.search_filter.word_filter import WordFilter
|
||||
from khoj.search_filter.file_filter import FileFilter
|
||||
|
||||
|
||||
# Arrange
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
client = TestClient(app)
|
||||
|
||||
|
||||
# Test
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_search_with_invalid_content_type():
|
||||
def test_search_with_invalid_content_type(client):
|
||||
# Arrange
|
||||
user_query = quote("How to call Khoj from Emacs?")
|
||||
|
||||
@@ -36,13 +33,8 @@ def test_search_with_invalid_content_type():
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_search_with_valid_content_type(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
config.content_type = content_config
|
||||
config.search_type = search_config
|
||||
|
||||
# config.content_type.image = search_config.image
|
||||
for content_type in ["org", "markdown", "ledger", "music"]:
|
||||
def test_search_with_valid_content_type(client):
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
|
||||
# Act
|
||||
response = client.get(f"/api/search?q=random&t={content_type}")
|
||||
# Assert
|
||||
@@ -50,7 +42,7 @@ def test_search_with_valid_content_type(content_config: ContentConfig, search_co
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_update_with_invalid_content_type():
|
||||
def test_update_with_invalid_content_type(client):
|
||||
# Act
|
||||
response = client.get(f"/api/update?t=invalid_content_type")
|
||||
|
||||
@@ -59,12 +51,8 @@ def test_update_with_invalid_content_type():
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_update_with_valid_content_type(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
config.content_type = content_config
|
||||
config.search_type = search_config
|
||||
|
||||
for content_type in ["org", "markdown", "ledger", "music"]:
|
||||
def test_update_with_valid_content_type(client):
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
|
||||
# Act
|
||||
response = client.get(f"/api/update?t={content_type}")
|
||||
# Assert
|
||||
@@ -72,7 +60,7 @@ def test_update_with_valid_content_type(content_config: ContentConfig, search_co
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_regenerate_with_invalid_content_type():
|
||||
def test_regenerate_with_invalid_content_type(client):
|
||||
# Act
|
||||
response = client.get(f"/api/update?force=true&t=invalid_content_type")
|
||||
|
||||
@@ -81,12 +69,8 @@ def test_regenerate_with_invalid_content_type():
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_regenerate_with_valid_content_type(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
config.content_type = content_config
|
||||
config.search_type = search_config
|
||||
|
||||
for content_type in ["org", "markdown", "ledger", "music", "image"]:
|
||||
def test_regenerate_with_valid_content_type(client):
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
|
||||
# Act
|
||||
response = client.get(f"/api/update?force=true&t={content_type}")
|
||||
# Assert
|
||||
@@ -94,10 +78,71 @@ def test_regenerate_with_valid_content_type(content_config: ContentConfig, searc
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_image_search(content_config: ContentConfig, search_config: SearchConfig):
|
||||
def test_get_configured_types_via_api(client):
    """The config types endpoint should list the content types enabled for the client fixture."""
    # Act
    reply = client.get("/api/config/types")

    # Assert
    assert reply.status_code == 200
    configured_types = reply.json()
    assert configured_types == ["org", "image", "plugin1"]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_get_configured_types_with_only_plugin_content_config(content_config):
    """The config types endpoint should list only the plugin type when only plugins are configured."""
    # Arrange: start from an empty content config and carry over just the plugins
    config.content_type = ContentConfig()
    config.content_type.plugins = content_config.plugins
    state.SearchType = configure_search_types(config)

    configure_routes(app)
    api_client = TestClient(app)

    # Act
    reply = api_client.get("/api/config/types")

    # Assert
    assert reply.status_code == 200
    configured_types = reply.json()
    assert configured_types == ["plugin1"]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_get_configured_types_with_no_plugin_content_config(content_config: ContentConfig, search_config: SearchConfig):
    """The config types endpoint should not list the plugin type once plugins are removed from the config.

    Both the content_config and search_config fixtures are requested because
    the body assigns each of them onto the shared application config.
    """
    # Arrange
    config.content_type = content_config
    config.search_type = search_config
    config.content_type.plugins = None
    state.SearchType = configure_search_types(config)

    configure_routes(app)
    client = TestClient(app)

    # Act
    response = client.get("/api/config/types")

    # Assert
    assert response.status_code == 200
    assert "plugin1" not in response.json()
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_get_configured_types_with_no_content_config():
    """The config types endpoint should return an empty list when the content config is empty."""
    # Arrange: reset the content config to a default-constructed one
    config.content_type = ContentConfig()
    state.SearchType = configure_search_types(config)

    configure_routes(app)
    api_client = TestClient(app)

    # Act
    reply = api_client.get("/api/config/types")

    # Assert
    assert reply.status_code == 200
    configured_types = reply.json()
    assert configured_types == []
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_image_search(client, content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
model.image_search = image_search.setup(content_config.image, search_config.image, regenerate=False)
|
||||
query_expected_image_pairs = [
|
||||
("kitten", "kitten_park.jpg"),
|
||||
@@ -119,7 +164,7 @@ def test_image_search(content_config: ContentConfig, search_config: SearchConfig
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_notes_search(content_config: ContentConfig, search_config: SearchConfig):
|
||||
def test_notes_search(client, content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
user_query = quote("How to git install application?")
|
||||
@@ -135,7 +180,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_notes_search_with_only_filters(content_config: ContentConfig, search_config: SearchConfig):
|
||||
def test_notes_search_with_only_filters(client, content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
filters = [WordFilter(), FileFilter()]
|
||||
model.orgmode_search = text_search.setup(
|
||||
@@ -154,7 +199,7 @@ def test_notes_search_with_only_filters(content_config: ContentConfig, search_co
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig):
|
||||
def test_notes_search_with_include_filter(client, content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
filters = [WordFilter()]
|
||||
model.orgmode_search = text_search.setup(
|
||||
@@ -173,7 +218,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig):
|
||||
def test_notes_search_with_exclude_filter(client, content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
filters = [WordFilter()]
|
||||
model.orgmode_search = text_search.setup(
|
||||
|
||||
@@ -3,11 +3,15 @@ import re
|
||||
from datetime import datetime
|
||||
from math import inf
|
||||
|
||||
# Application Packages
|
||||
# External Packages
|
||||
import pytest
|
||||
|
||||
# Internal Packages
|
||||
from khoj.search_filter.date_filter import DateFilter
|
||||
from khoj.utils.rawconfig import Entry
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:The localize method is no longer necessary.")
|
||||
def test_date_filter():
|
||||
entries = [
|
||||
Entry(compiled="", raw="Entry with no date"),
|
||||
@@ -46,6 +50,7 @@ def test_date_filter():
|
||||
assert entry_indices == {1, 2}
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:The localize method is no longer necessary.")
|
||||
def test_extract_date_range():
|
||||
assert DateFilter().extract_date_range('head dt>"1984-01-04" dt<"1984-01-07" tail') == [
|
||||
datetime(1984, 1, 5, 0, 0, 0).timestamp(),
|
||||
@@ -57,6 +62,10 @@ def test_extract_date_range():
|
||||
datetime(1984, 1, 1, 0, 0, 0).timestamp(),
|
||||
datetime(1984, 1, 2, 0, 0, 0).timestamp(),
|
||||
]
|
||||
assert DateFilter().extract_date_range('head dt="1984-01-01"') == [
|
||||
datetime(1984, 1, 1, 0, 0, 0).timestamp(),
|
||||
datetime(1984, 1, 2, 0, 0, 0).timestamp(),
|
||||
]
|
||||
|
||||
# Unparseable date filter specified in query
|
||||
assert DateFilter().extract_date_range('head dt:"Summer of 69" tail') == None
|
||||
@@ -68,6 +77,7 @@ def test_extract_date_range():
|
||||
assert DateFilter().extract_date_range('head dt>"1984-01-01" dt<"1984-01-01" tail') == None
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:The localize method is no longer necessary.")
|
||||
def test_parse():
|
||||
test_now = datetime(1984, 4, 1, 21, 21, 21)
|
||||
|
||||
|
||||
82
tests/test_jsonl_to_jsonl.py
Normal file
82
tests/test_jsonl_to_jsonl.py
Normal file
@@ -0,0 +1,82 @@
|
||||
# Standard Packages
|
||||
import json
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
|
||||
from khoj.utils.jsonl import load_jsonl
|
||||
from khoj.utils.rawconfig import Entry
|
||||
|
||||
|
||||
def test_process_entries_from_single_input_jsonl(tmp_path):
    """Round-trip two entries stored in one JSONL file through JsonlToJsonl."""
    # Arrange
    # A single file holding two newline-terminated JSON entries
    input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}
{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}
"""
    input_jsonl_file = create_file(tmp_path, input_jsonl)

    # Act
    # Load the raw entries, wrap each one in an Entry, then serialize them back
    loaded_jsons = JsonlToJsonl.extract_jsonl_entries([input_jsonl_file])
    entries = [Entry.from_dict(loaded_json) for loaded_json in loaded_jsons]
    output_jsonl = JsonlToJsonl.convert_entries_to_jsonl(entries)

    # Assert
    # Both entries are extracted and the serialized output equals the input text
    assert len(entries) == 2
    assert output_jsonl == input_jsonl
|
||||
|
||||
|
||||
def test_process_entries_from_multiple_input_jsonls(tmp_path):
    """Convert jsonl entries spread across multiple input files to entries."""
    # Arrange
    # Two input files, each holding a single JSON entry without a trailing newline
    input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}"""
    input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}"""
    input_jsonl_file_1 = create_file(tmp_path, input_jsonl_1, filename="input1.jsonl")
    input_jsonl_file_2 = create_file(tmp_path, input_jsonl_2, filename="input2.jsonl")

    # Act
    # Process each entry from all input JSONL files
    input_jsons = JsonlToJsonl.extract_jsonl_entries([input_jsonl_file_1, input_jsonl_file_2])
    entries = list(map(Entry.from_dict, input_jsons))
    output_jsonl = JsonlToJsonl.convert_entries_to_jsonl(entries)

    # Assert
    # One entry per input file; the output lists them in input order, each newline-terminated
    assert len(entries) == 2
    assert output_jsonl == f"{input_jsonl_1}\n{input_jsonl_2}\n"
|
||||
|
||||
|
||||
def test_get_jsonl_files(tmp_path):
    """Ensure JSONL files specified via input-filter, input-files extracted."""
    # Arrange
    # Files that the input-filter globs below are expected to match
    glob_matched_files = [
        create_file(tmp_path, filename=name)
        for name in (
            "group1-file1.jsonl",
            "group1-file2.jsonl",
            "group2-file1.jsonl",
            "group2-file2.jsonl",
        )
    ]
    # File referenced directly through the input-files field
    notes_file = create_file(tmp_path, filename="notes.jsonl")
    # Files that neither the input-files nor the input-filter refer to
    for name in ("not-included-jsonl.jsonl", "not-included-text.txt"):
        create_file(tmp_path, filename=name)

    expected_files = sorted(str(path) for path in [*glob_matched_files, notes_file])

    # Setup input-files, input-filters
    input_files = [tmp_path / "notes.jsonl"]
    input_filter = [tmp_path / "group1*.jsonl", tmp_path / "group2*.jsonl"]

    # Act
    extracted_jsonl_files = JsonlToJsonl.get_jsonl_files(input_files, input_filter)

    # Assert
    assert len(extracted_jsonl_files) == 5
    assert extracted_jsonl_files == expected_files
|
||||
|
||||
|
||||
# Helper Functions
|
||||
def create_file(tmp_path, entry=None, filename="test.jsonl"):
    """Create a JSONL file under ``tmp_path`` and return its path.

    Args:
        tmp_path: Directory (a ``pathlib.Path``) in which to create the file.
        entry: Text to write into the file. When falsy (``None`` or ``""``),
            an empty file is created instead.
        filename: Name of the file to create inside ``tmp_path``.

    Returns:
        The path of the created file.
    """
    jsonl_file = tmp_path / filename
    # Writing always creates the file, so no separate touch() is needed.
    # Pin the encoding so the fixture content does not depend on the locale.
    jsonl_file.write_text(entry or "", encoding="utf-8")
    return jsonl_file
|
||||
@@ -1,5 +1,6 @@
|
||||
# Standard Packages
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
@@ -66,16 +67,17 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Markdown files
|
||||
entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
|
||||
entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
|
||||
entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
|
||||
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)
|
||||
)
|
||||
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
assert len(jsonl_data) == 2
|
||||
# Ensure entry compiled strings include the markdown files they originate from
|
||||
assert all([markdownfile.stem in entry.compiled for entry in entries])
|
||||
|
||||
|
||||
def test_get_markdown_files(tmp_path):
|
||||
|
||||
@@ -44,7 +44,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
||||
# Arrange
|
||||
entry = f"""*** Heading
|
||||
\t\r
|
||||
Body Line 1
|
||||
Body Line
|
||||
"""
|
||||
orgfile = create_file(tmp_path, entry)
|
||||
|
||||
@@ -108,6 +108,30 @@ def test_entry_with_body_to_jsonl(tmp_path):
|
||||
assert len(jsonl_data) == 1
|
||||
|
||||
|
||||
def test_file_with_entry_after_intro_text_to_jsonl(tmp_path):
    """Ensure intro text before any headings is indexed."""
    # Arrange
    # Org content with free text ahead of the first heading
    entry = f"""
Intro text

* Entry Heading
entry body
"""
    orgfile = create_file(tmp_path, entry)

    # Act
    # Extract entry nodes from the org file, convert them to entries, then serialize to JSONL
    org_nodes, node_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
    entries = OrgToJsonl.convert_org_nodes_to_entries(org_nodes, node_to_file_map)
    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
    jsonl_data = list(map(json.loads, jsonl_string.splitlines()))

    # Assert
    # Expect two JSONL records from the intro text plus the single heading
    assert len(jsonl_data) == 2
|
||||
|
||||
|
||||
def test_file_with_no_headings_to_jsonl(tmp_path):
|
||||
"Ensure files with no heading, only body text are loaded."
|
||||
# Arrange
|
||||
|
||||
@@ -268,7 +268,7 @@ def test_parse_entry_with_multiple_titles_and_no_headings(tmp_path):
|
||||
# Arrange
|
||||
entry = f"""#+TITLE: title1
|
||||
Body Line 1
|
||||
#+TITLE: title2 """
|
||||
#+TITLE: title2 """
|
||||
orgfile = create_file(tmp_path, entry)
|
||||
|
||||
# Act
|
||||
@@ -286,6 +286,50 @@ Body Line 1
|
||||
assert entries[0].deadline == ""
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_parse_org_with_intro_text_before_heading(tmp_path):
    """Test parsing of org file with intro text before heading."""
    # Arrange
    body = f"""#+TITLE: Title
intro body
* Entry Heading
entry body
"""
    orgfile = create_file(tmp_path, body)

    # Act
    entries = orgnode.makelist(orgfile)

    # Assert
    assert len(entries) == 2
    intro_node, heading_node = entries
    # The intro text is expected as the first node, headed by the file title
    assert intro_node.heading == "Title"
    assert intro_node.body == "intro body\n"
    # The heading entry is expected as the second node
    assert heading_node.heading == "Entry Heading"
    assert heading_node.body == "entry body\n"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_parse_org_with_intro_text_multiple_titles_and_heading(tmp_path):
    """Test parsing of org file with intro text, multiple titles and heading entry."""
    # Arrange
    # The second title line appears after the heading entry
    body = f"""#+TITLE: Title1
intro body
* Entry Heading
entry body
#+TITLE: Title2 """
    orgfile = create_file(tmp_path, body)

    # Act
    entries = orgnode.makelist(orgfile)

    # Assert
    assert len(entries) == 2
    intro_node, heading_node = entries
    # Both titles are expected, space separated, as the heading of the intro node
    assert intro_node.heading == "Title1 Title2"
    assert intro_node.body == "intro body\n"
    assert heading_node.heading == "Entry Heading"
    assert heading_node.body == "entry body\n"
|
||||
|
||||
|
||||
# Helper Functions
|
||||
def create_file(tmp_path, entry, filename="test.org"):
|
||||
org_file = tmp_path / f"notes/{filename}"
|
||||
|
||||
29
tests/test_rawconfig.py
Normal file
29
tests/test_rawconfig.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# External Packages
|
||||
import pytest
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils.rawconfig import TextContentConfig, ImageContentConfig
|
||||
|
||||
|
||||
# Test
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_input_file_or_filter_required_in_text_content_config():
    """Expect ValueError when a text content config has neither input files nor an input filter."""
    # Arrange
    config_without_inputs = dict(
        input_files=None,
        input_filter=None,
        compressed_jsonl="notes.jsonl",
        embeddings_file="note_embeddings.pt",
    )

    # Act, Assert
    with pytest.raises(ValueError):
        TextContentConfig(**config_without_inputs)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_input_filter_or_directories_required_in_image_content_config():
    """Expect ValueError when an image content config has neither input directories nor an input filter."""
    # Arrange
    config_without_inputs = dict(
        input_directories=None,
        input_filter=None,
        embeddings_file="note_embeddings.pt",
    )

    # Act, Assert
    with pytest.raises(ValueError):
        ImageContentConfig(**config_without_inputs)
|
||||
@@ -1,5 +1,4 @@
|
||||
# System Packages
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
|
||||
# External Packages
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
{
|
||||
"0.2.1": "0.15.0",
|
||||
"0.2.5": "0.15.0",
|
||||
"0.2.6": "0.15.0"
|
||||
"0.2.1": "0.15.0",
|
||||
"0.2.5": "0.15.0",
|
||||
"0.2.6": "0.15.0",
|
||||
"0.3.0": "0.15.0",
|
||||
"0.4.0": "0.15.0",
|
||||
"0.5.0": "0.15.0",
|
||||
"0.6.0": "0.15.0"
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user