A professional, extensible Python package for extracting text from multiple file formats with both synchronous and asynchronous support.
- 🔄 Dual Input Support: Works with file paths or raw bytes
- ⚡ Sync & Async APIs: Choose the right approach for your use case
- 📁 Multiple Formats: PDF, DOCX, DOC, TXT, ZIP, Markdown, RTF, HTML, CSV, JSON, XML
- 🎯 Optional Dependencies: Install only what you need
- 🛡️ Robust Error Handling: Comprehensive exception hierarchy
- 📊 Professional Logging: Detailed debug and info level logging
- 🔒 Thread-Safe: Async operations use thread pools for I/O-bound tasks
- 🧹 Context Manager Support: Automatic resource cleanup
For complete documentation, including installation instructions, usage examples, and API reference, please visit our documentation site.
pip install textxtract

# Install support for specific formats
pip install textxtract[pdf] # PDF support
pip install textxtract[docx] # Word documents
pip install textxtract[all] # All supported formats
# Multiple formats
pip install textxtract[pdf,docx,html]

from textxtract import SyncTextExtractor
extractor = SyncTextExtractor()
# Extract from file path
text = extractor.extract("document.pdf")
print(text)
# Extract from bytes (filename required for type detection)
with open("document.pdf", "rb") as f:
file_bytes = f.read()
text = extractor.extract(file_bytes, "document.pdf")
print(text)

from textxtract import AsyncTextExtractor
import asyncio
async def extract_text():
extractor = AsyncTextExtractor()
# Extract from file path
text = await extractor.extract("document.pdf")
return text
# Run async extraction
text = asyncio.run(extract_text())
print(text)

# Automatic resource cleanup
with SyncTextExtractor() as extractor:
text = extractor.extract("document.pdf")
# Async context manager
async with AsyncTextExtractor() as extractor:
text = await extractor.extract("document.pdf")

| Format | Extensions | Dependencies | Installation |
|---|---|---|---|
| Text | .txt, .text | Built-in | pip install textxtract |
| Markdown | .md | Optional | pip install textxtract[md] |
| PDF | .pdf | Optional | pip install textxtract[pdf] |
| Word | .docx | Optional | pip install textxtract[docx] |
| Word Legacy | .doc | Optional | pip install textxtract[doc] |
| Rich Text | .rtf | Optional | pip install textxtract[rtf] |
| HTML | .html, .htm | Optional | pip install textxtract[html] |
| CSV | .csv | Built-in | pip install textxtract |
| JSON | .json | Built-in | pip install textxtract |
| XML | .xml | Optional | pip install textxtract[xml] |
| ZIP | .zip | Built-in | pip install textxtract |

from textxtract import SyncTextExtractor
from textxtract.exceptions import (
FileTypeNotSupportedError,
InvalidFileError,
ExtractionError
)
extractor = SyncTextExtractor()
try:
text = extractor.extract("document.pdf")
print(text)
except FileTypeNotSupportedError:
print("❌ File type not supported")
except InvalidFileError:
print("❌ File is invalid or corrupted")
except ExtractionError:
print("❌ Extraction failed")

from textxtract import SyncTextExtractor
from textxtract import ExtractorConfig
# Custom configuration
config = ExtractorConfig(
encoding="utf-8",
max_file_size=50 * 1024 * 1024, # 50MB limit
logging_level="DEBUG"
)
extractor = SyncTextExtractor(config)
text = extractor.extract("document.pdf")

import asyncio
from pathlib import Path
from textxtract import AsyncTextExtractor
async def process_files(file_paths):
async with AsyncTextExtractor() as extractor:
# Process files concurrently
tasks = [extractor.extract(path) for path in file_paths]
results = await asyncio.gather(*tasks, return_exceptions=True)
return results
# Process multiple files
files = [Path("doc1.pdf"), Path("doc2.docx"), Path("doc3.txt")]
results = asyncio.run(process_files(files))
for file, result in zip(files, results):
if isinstance(result, Exception):
print(f"❌ {file}: {result}")
else:
print(f"✅ {file}: {len(result)} characters extracted")

import logging
from textxtract import SyncTextExtractor
# Enable debug logging
logging.basicConfig(level=logging.DEBUG)
extractor = SyncTextExtractor()
text = extractor.extract("document.pdf") # Will show detailed logs

# Install test dependencies
pip install textxtract[all] pytest pytest-asyncio
# Run tests
pytest
# Run with coverage
pytest --cov=textxtract

- 📖 Complete Documentation
- 🚀 Installation Guide
- 📘 Usage Examples
- 🔍 API Reference
- 🧪 Testing Guide
- 🤝 Contributing Guide
from textxtract import SyncTextExtractor
def process_document(file_path):
extractor = SyncTextExtractor()
text = extractor.extract(file_path)
# Process extracted text
word_count = len(text.split())
return {
"file": file_path,
"text": text,
"word_count": word_count
}

import asyncio
from textxtract import AsyncTextExtractor
async def analyze_content(files):
async with AsyncTextExtractor() as extractor:
results = []
for file in files:
try:
text = await extractor.extract(file)
# Perform analysis
analysis = {
"file": file,
"length": len(text),
"words": len(text.split()),
"contains_email": "@" in text
}
results.append(analysis)
except Exception as e:
results.append({"file": file, "error": str(e)})
return results

from datetime import datetime
from textxtract import SyncTextExtractor
def extract_and_store(file_path, database):
extractor = SyncTextExtractor()
try:
text = extractor.extract(file_path)
# Store in database
database.store({
"file_path": str(file_path),
"content": text,
"extracted_at": datetime.now(),
"status": "success"
})
except Exception as e:
database.store({
"file_path": str(file_path),
"error": str(e),
"extracted_at": datetime.now(),
"status": "failed"
})

- Python 3.9+
- Optional dependencies for specific file types
- See Installation Guide for details
We welcome contributions! Please see our Contributing Guide for details.
# Fork and clone the repo
git clone https://github.com/10XScale-in/textxtract.git
cd textxtract
# Set up development environment
pip install -e .[all]
pip install pytest pytest-asyncio black isort mypy
# Run tests
pytest
# Format code
black textxtract tests
isort textxtract tests

This project is licensed under the MIT License - see the LICENSE file for details.
- 🐛 Bug Reports: GitHub Issues
- 💡 Feature Requests: GitHub Discussions
- 📧 Questions: GitHub Discussions
- Thanks to all contributors who have helped improve this project
- Built with Python and the amazing open-source ecosystem
- Special thanks to the maintainers of underlying libraries