mirror of
https://github.com/datalab-to/chandra.git
synced 2026-05-13 15:45:46 +00:00
README updates
This commit is contained in:
105
FULL_BENCHMARKS.md
Normal file
105
FULL_BENCHMARKS.md
Normal file
@@ -0,0 +1,105 @@
|
||||
# Full 90-Language Benchmark
|
||||
|
||||
This is a comprehensive multilingual evaluation covering 90 languages, comparing Chandra 2 against Gemini 2.5 Flash. The average scores are lower than in the [43-language benchmark](README.md#multilingual-benchmark-table) because this benchmark includes many lower-resource languages.
|
||||
|
||||
## Overall Scores
|
||||
|
||||
| | Chandra 2 | Gemini 2.5 Flash |
|
||||
|---|:---:|:---:|
|
||||
| **Average** | **72.7% +/- 1.2%** | **60.8% +/- 1.3%** |
|
||||
|
||||
## Results by Language
|
||||
|
||||
| Language | Chandra 2 | Gemini 2.5 Flash |
|
||||
|----------|:--------:|:----------------:|
|
||||
| af | 80.4% | 85.8% |
|
||||
| am | 34.4% | 0.5% |
|
||||
| ar | 68.4% | 84.4% |
|
||||
| as | 35.8% | 23.1% |
|
||||
| az | 75.2% | 74.0% |
|
||||
| be | 80.7% | 66.4% |
|
||||
| bg | 83.1% | 64.3% |
|
||||
| bn | 72.8% | 55.3% |
|
||||
| br | 90.0% | 69.4% |
|
||||
| bs | 84.8% | 85.1% |
|
||||
| ca | 85.1% | 88.0% |
|
||||
| cs | 85.3% | 79.1% |
|
||||
| cy | 82.2% | 77.6% |
|
||||
| da | 91.1% | 86.0% |
|
||||
| de | 94.8% | 88.3% |
|
||||
| el | 85.6% | 83.5% |
|
||||
| en | 96.6% | 90.3% |
|
||||
| eo | 80.1% | 71.9% |
|
||||
| es | 89.3% | 86.8% |
|
||||
| et | 75.2% | 73.7% |
|
||||
| eu | 80.2% | 74.6% |
|
||||
| fa | 75.1% | 61.8% |
|
||||
| fi | 83.4% | 86.0% |
|
||||
| fr | 93.7% | 86.1% |
|
||||
| fy | 81.2% | 70.1% |
|
||||
| ga | 80.9% | 70.1% |
|
||||
| gd | 71.8% | 59.5% |
|
||||
| gl | 80.9% | 80.9% |
|
||||
| gu | 70.8% | 47.6% |
|
||||
| ha | 72.1% | 59.1% |
|
||||
| he | 70.4% | 50.9% |
|
||||
| hi | 78.4% | 82.7% |
|
||||
| hr | 90.1% | 88.2% |
|
||||
| hu | 82.1% | 84.5% |
|
||||
| hy | 64.2% | 42.1% |
|
||||
| id | 91.6% | 88.3% |
|
||||
| is | 77.3% | 72.2% |
|
||||
| it | 94.6% | 85.7% |
|
||||
| ja | 86.9% | 80.0% |
|
||||
| jv | 73.2% | 80.4% |
|
||||
| ka | 77.0% | 39.3% |
|
||||
| kk | 80.5% | 77.2% |
|
||||
| km | 46.1% | 6.3% |
|
||||
| kn | 63.2% | 24.5% |
|
||||
| ko | 81.5% | 84.8% |
|
||||
| ku | 62.0% | 63.2% |
|
||||
| ky | 81.2% | 69.8% |
|
||||
| la | 73.8% | 70.5% |
|
||||
| lo | 60.9% | 13.3% |
|
||||
| lt | 79.8% | 70.5% |
|
||||
| lv | 76.9% | 81.5% |
|
||||
| mg | 81.2% | 78.4% |
|
||||
| mk | 83.5% | 77.4% |
|
||||
| ml | 64.3% | 23.8% |
|
||||
| mn | 88.4% | 71.4% |
|
||||
| mr | 75.0% | 69.7% |
|
||||
| ms | 79.3% | 79.8% |
|
||||
| my | 55.9% | 15.8% |
|
||||
| ne | 45.3% | 43.0% |
|
||||
| nl | 88.6% | 87.5% |
|
||||
| no | 90.5% | 87.8% |
|
||||
| or | 31.1% | 11.2% |
|
||||
| pa | 48.3% | 22.4% |
|
||||
| pl | 91.5% | 91.1% |
|
||||
| ps | 12.6% | 13.3% |
|
||||
| pt | 95.2% | 89.4% |
|
||||
| ro | 84.5% | 76.7% |
|
||||
| ru | 85.5% | 82.8% |
|
||||
| sa | 51.1% | 44.6% |
|
||||
| sd | 50.0% | 29.3% |
|
||||
| si | 62.4% | 26.2% |
|
||||
| sk | 77.3% | 81.2% |
|
||||
| sl | 81.0% | 80.1% |
|
||||
| so | 82.4% | 69.9% |
|
||||
| sq | 75.3% | 77.1% |
|
||||
| sr | 90.3% | 89.7% |
|
||||
| su | 85.7% | 96.4% |
|
||||
| sv | 93.3% | 91.1% |
|
||||
| sw | 88.9% | 80.9% |
|
||||
| ta | 77.7% | 53.9% |
|
||||
| te | 58.6% | 33.3% |
|
||||
| th | 62.6% | 66.7% |
|
||||
| tr | 84.1% | 84.1% |
|
||||
| ug | 25.8% | 5.4% |
|
||||
| uk | 91.0% | 87.9% |
|
||||
| ur | 44.1% | 57.6% |
|
||||
| uz | 77.2% | 52.8% |
|
||||
| vi | 82.6% | 89.5% |
|
||||
| xh | 82.1% | 62.1% |
|
||||
| yi | 24.9% | 6.8% |
|
||||
| zh | 88.7% | 70.0% |
|
||||
22
README.md
22
README.md
@@ -65,7 +65,7 @@ Multilingual performance was a focus for us with Chandra 2. There isn't a good
|
||||
|
||||
<img src="assets/benchmarks/multilingual.png" width="600px"/>
|
||||
|
||||
See full scores [below](#multilingual-benchmark-table).
|
||||
See full scores [below](#multilingual-benchmark-table). We also have a [full 90-language benchmark](FULL_BENCHMARKS.md).
|
||||
|
||||
We also benchmarked Chandra 2 with the widely accepted olmocr benchmark:
|
||||
|
||||
@@ -144,7 +144,7 @@ chandra ./documents ./output --method hf
|
||||
- `--max-workers INTEGER`: Parallel workers for vLLM
|
||||
- `--include-images/--no-images`: Extract and save images (default: include)
|
||||
- `--include-headers-footers/--no-headers-footers`: Include page headers/footers (default: exclude)
|
||||
- `--batch-size INTEGER`: Pages per batch (default: 1)
|
||||
- `--batch-size INTEGER`: Pages per batch (default: 28 for vllm, 1 for hf)
|
||||
|
||||
**Output Structure:**
|
||||
|
||||
@@ -152,7 +152,7 @@ Each processed file creates a subdirectory with:
|
||||
- `<filename>.md` - Markdown output
|
||||
- `<filename>.html` - HTML output
|
||||
- `<filename>_metadata.json` - Metadata (page info, token count, etc.)
|
||||
- `images/` - Extracted images from the document
|
||||
- Extracted images are saved directly in the output directory
|
||||
|
||||
### Streamlit Web App
|
||||
|
||||
@@ -176,7 +176,7 @@ This launches a Docker container with optimized inference settings. Configure vi
|
||||
- `VLLM_MODEL_NAME`: Model name for the server (default: `chandra`)
|
||||
- `VLLM_GPUS`: GPU device IDs (default: `0`)
|
||||
|
||||
You can also start your own vllm server with the `datalab-to/chandra` model.
|
||||
You can also start your own vLLM server with the `datalab-to/chandra-ocr-2` model.
|
||||
|
||||
### Configuration
|
||||
|
||||
@@ -184,7 +184,7 @@ Settings can be configured via environment variables or a `local.env` file:
|
||||
|
||||
```bash
|
||||
# Model settings
|
||||
MODEL_CHECKPOINT=datalab-to/chandra
|
||||
MODEL_CHECKPOINT=datalab-to/chandra-ocr-2
|
||||
MAX_OUTPUT_TOKENS=8192
|
||||
|
||||
# vLLM settings
|
||||
@@ -218,6 +218,8 @@ This code is Apache 2.0, and our model weights use a modified OpenRAIL-M license
|
||||
|
||||
# Multilingual benchmark table
|
||||
|
||||
The table below covers the 43 most common languages, benchmarked across multiple models. For a comprehensive evaluation across 90 languages (Chandra 2 vs Gemini 2.5 Flash only), see the [full 90-language benchmark](#full-90-language-benchmark-table).
|
||||
|
||||
| Language | Datalab API | Chandra 2 | Chandra 1 | Gemini 2.5 Flash | GPT-5 Mini |
|
||||
|---|:---:|:---:|:---:|:---:|:---:|
|
||||
| ar | 67.6% | 68.4% | 34.0% | 84.4% | 55.6% |
|
||||
@@ -264,11 +266,17 @@ This code is Apache 2.0, and our model weights use a modified OpenRAIL-M license
|
||||
| zh | 87.8% | 88.7% | 88.3% | 70.0% | 70.4% |
|
||||
| **Average** | **80.4%** | **77.8%** | **69.4%** | **67.6%** | **60.5%** |
|
||||
|
||||
# Full 90-language benchmark table
|
||||
|
||||
We also have a more comprehensive evaluation covering 90 languages, comparing Chandra 2 against Gemini 2.5 Flash. The average scores are lower than in the 43-language table above because this evaluation includes many lower-resource languages. Chandra 2 averages **72.7%** vs Gemini 2.5 Flash at **60.8%**.
|
||||
|
||||
See the [full 90-language results](FULL_BENCHMARKS.md).
|
||||
|
||||
# Credits
|
||||
|
||||
Thank you to the following open source projects:
|
||||
|
||||
- [Huggingface Transformers](https://github.com/huggingface/transformers)
|
||||
- [vLLM](https://github.com/vllm-project/vllm)
|
||||
- [olmocr](github.com/allenai/olmocr)
|
||||
- [Qwen 3 VL](https://github.com/QwenLM/Qwen3)
|
||||
- [olmocr](https://github.com/allenai/olmocr)
|
||||
- [Qwen 3.5](https://github.com/QwenLM/Qwen3)
|
||||
@@ -16,7 +16,7 @@ def generate_hf(
|
||||
if max_output_tokens is None:
|
||||
max_output_tokens = settings.MAX_OUTPUT_TOKENS
|
||||
|
||||
conversations = [[process_batch_element(item, bbox_scale)] for item in batch]
|
||||
conversations = [[process_batch_element(item)] for item in batch]
|
||||
|
||||
inputs = model.processor.apply_chat_template(
|
||||
conversations,
|
||||
@@ -45,12 +45,12 @@ def generate_hf(
|
||||
return results
|
||||
|
||||
|
||||
def process_batch_element(item: BatchInputItem, bbox_scale: int):
|
||||
def process_batch_element(item: BatchInputItem):
|
||||
prompt = item.prompt
|
||||
prompt_type = item.prompt_type
|
||||
|
||||
if not prompt:
|
||||
prompt = PROMPT_MAPPING[prompt_type].replace("{bbox_scale}", str(bbox_scale))
|
||||
prompt = PROMPT_MAPPING[prompt_type]
|
||||
|
||||
content = []
|
||||
image = scale_to_fit(item.image) # Guarantee max size
|
||||
|
||||
@@ -56,9 +56,7 @@ def generate_vllm(
|
||||
def _generate(item: BatchInputItem, temperature, top_p) -> GenerationResult:
|
||||
prompt = item.prompt
|
||||
if not prompt:
|
||||
prompt = PROMPT_MAPPING[item.prompt_type].replace(
|
||||
"{bbox_scale}", str(bbox_scale)
|
||||
)
|
||||
prompt = PROMPT_MAPPING[item.prompt_type]
|
||||
|
||||
content = []
|
||||
image = scale_to_fit(item.image)
|
||||
|
||||
@@ -27,7 +27,7 @@ def extract_images(html: str, chunks: dict, image: Image.Image):
|
||||
for idx, chunk in enumerate(chunks):
|
||||
div_idx += 1
|
||||
if chunk["label"] in ["Image", "Figure"]:
|
||||
img = chunk["content"].find("img")
|
||||
img = BeautifulSoup(chunk["content"], "html.parser").find("img")
|
||||
if not img:
|
||||
continue
|
||||
bbox = chunk["bbox"]
|
||||
|
||||
Reference in New Issue
Block a user