[VPM] fix the parser to handle PDF files better: update requirements.txt

davidangularme · davidangularme · commit d7d947ea5f94 · 2026-04-13T17:09:14.000+03:00
diff --git a/requirements.txt b/requirements.txt
@@ -1,15 +1,16 @@
 huggingface_hub
 # LightRAG packages
 lightrag-hku
-# MinerU 2.0 packages (replaces magic-pdf)
+# MinerU 2.0 packages (replaces magic-pdf) - handles PDF parsing with multiple backends
 mineru[core]
 # Progress bars for batch processing
 tqdm
 # Note: Optional dependencies are now defined in setup.py extras_require:
 # - [image]: Pillow>=10.0.0 (for BMP, TIFF, GIF, WebP format conversion)
 # - [text]: reportlab>=4.0.0 (for TXT, MD to PDF conversion)
-# - [paddleocr]: paddleocr + pypdfium2 (for parser='paddleocr')
+# - [paddleocr]: paddleocr + pypdfium2 (for parser='paddleocr'; better OCR for scanned PDFs)
 # - [office]: requires LibreOffice (external program, not Python package)
 # - [all]: includes all optional dependencies
 #
 # Install with: pip install raganything[image,text] or pip install raganything[all]
+# For scanned (image-only) PDFs that need OCR: pip install raganything[paddleocr]