From f8f59377c58d8426fdb2437e93c3faca7a7e605b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikul=C3=A1=C5=A1=20Poul?= Date: Tue, 26 Aug 2025 15:38:41 +0100 Subject: [PATCH] Add README, verify examples from README works, bump version --- README.md | 222 ++++++++++++++++++ pyproject.toml | 3 +- tests/test_project/bookshop/__init__.py | 0 tests/test_project/bookshop/factories.py | 67 ++++++ .../bookshop/migrations/0001_initial.py | 80 +++++++ .../bookshop/migrations/__init__.py | 0 tests/test_project/bookshop/models.py | 30 +++ tests/test_project/settings.py | 1 + tests/test_readme.py | 65 +++++ tox.ini | 1 + 10 files changed, 468 insertions(+), 1 deletion(-) create mode 100644 tests/test_project/bookshop/__init__.py create mode 100644 tests/test_project/bookshop/factories.py create mode 100644 tests/test_project/bookshop/migrations/0001_initial.py create mode 100644 tests/test_project/bookshop/migrations/__init__.py create mode 100644 tests/test_project/bookshop/models.py create mode 100644 tests/test_readme.py diff --git a/README.md b/README.md index e69de29..bbcc756 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,222 @@ +# django-memoized-prefetch + +A Django package that provides efficient memoized prefetching for processing data in chunks, reducing database queries through intelligent caching. +In some cases it can be useful even when not processing data in chunks, for example, when there are multiple foreign keys to the same table. + +## Overview + +`django-memoized-prefetch` optimizes Django ORM queries when processing large datasets by: +- **Reusing previously fetched objects** across chunks +- **Memoizing prefetched objects** using LRU (Least Recently Used) cache +- **Supporting both foreign key and many-to-many relationships** +- **Minimizing database queries** across chunk processing operations + +## Installation + +```bash +pip install django-memoized-prefetch +``` + +## Requirements + +- Python 3.9+ +- Django 4.2+ +- lru-dict 1.3.0+ + +## Usage Examples + +
+ Models used in examples, click to expand + +```python +from django.db import models + +class Author(models.Model): + name = models.CharField(max_length=255) + email = models.EmailField() + +class Publisher(models.Model): + name = models.CharField(max_length=255) + country = models.CharField(max_length=100) + +class Category(models.Model): + name = models.CharField(max_length=100) + +class Book(models.Model): + title = models.CharField(max_length=255) + isbn = models.CharField(max_length=13) + author = models.ForeignKey(Author, on_delete=models.CASCADE, related_name="books") + translator = models.ForeignKey(Author, on_delete=models.CASCADE, related_name="translations", null=True) + publisher = models.ForeignKey(Publisher, on_delete=models.CASCADE, related_name="books") + categories = models.ManyToManyField(Category, related_name="books") + +class Review(models.Model): + book = models.ForeignKey(Book, on_delete=models.CASCADE, related_name="reviews") + rating = models.IntegerField() + comment = models.TextField() +``` + +
+
+### Basic Usage
+
+Imagine you want to process all books, but there are too many of them to load them all into memory at once.
+You therefore need to process them in chunks.
+
+If you use just native Django, it will look something like this:
+
+```python
+from chunkator import chunkator_page
+
+for chunk in chunkator_page(Book.objects.all().prefetch_related("author", "translator", "publisher"), 10_000):
+    for book in chunk:
+        print(book.author.name, book.translator.name if book.translator is not None else None)
+        print(book.publisher.name)
+```
+
+This will work, with two caveats:
+1. On each chunk, Django will make separate queries to fetch the author and translator
+2. The author, translator and publisher objects will be fetched from the database for each chunk
+
+This is the primary use case for this package. When used like this:
+
+```python
+from django_memoized_prefetch import MemoizedPrefetch, MemoizedPrefetchConfig
+from chunkator import chunkator_page
+
+memoized_prefetch = MemoizedPrefetch(
+    MemoizedPrefetchConfig(Author, ["author", "translator"]),
+    MemoizedPrefetchConfig(Publisher, ["publisher"], prefetch_all=True),
+)
+
+for chunk in chunkator_page(Book.objects.all(), 10_000):
+    memoized_prefetch.process_chunk(chunk)
+
+    for book in chunk:
+        print(book.author.name, book.translator.name if book.translator is not None else None)
+        print(book.publisher.name)
+```
+
+The processing will be more efficient, because:
+1. All publishers will get fetched before processing any chunks, and they will be reused across all chunks
+2. The author and translator objects will be fetched using one query
+3. Any authors and translators that appeared in previous chunks will not be fetched again
+
+#### Nested attributes
+
+You can also prefetch nested attributes using both dotted notation and underscore notation; for example, in the example below both forms would work.
+ +```python +memoized_prefetch = MemoizedPrefetch( + MemoizedPrefetchConfig(Publisher, ["book.publisher"]), + MemoizedPrefetchConfig(Author, ["book__author"]), +) + +for chunk in chunkator_page(Review.objects.all(), 10000): + memoized_prefetch.process_chunk(chunk) + ... +``` + +### Many-to-Many Relationships + +Many-to-many relationships are supported as well, caching the target model, while fetching the through model for each chunk. + +```python +from django_memoized_prefetch import MemoizedPrefetch, MemoizedPrefetchConfig +from chunkator import chunkator_page + +# Configure for many-to-many relationships +memoized_prefetch = MemoizedPrefetch( + MemoizedPrefetchConfig( + model=Category, + attributes=["categories"], + is_many_to_many=True, + through_model=Book.categories.through, + source_field="book_id", + target_field="category_id", + ) +) + +# Process books with their categories +for chunk in chunkator_page(Book.objects.all(), 10000): + memoized_prefetch.process_chunk(chunk) + + for book in chunk: + # Categories are prefetched and available + category_names = [cat.name for cat in book.categories.all()] + print(f"Book: {book.title}, Categories: {', '.join(category_names)}") +``` + +### Usage outside chunked processing + +If you have multiple foreign keys to the same table, this package can be used to optimise the database queries even when not processing data in chunks. 
+ +## Configuration Options + +### MemoizedPrefetchConfig Parameters + +- **`model`** (required): The Django model class to prefetch +- **`attributes`** (required): List of attribute names to prefetch on your objects +- **`queryset`** (optional): Custom queryset for the model (for additional select_related/prefetch_related) +- **`prefetch_all`** (optional, default: False): Whether to prefetch all objects at initialisation +- **`lru_cache_size`** (optional, default: 10,000): Maximum number of objects to keep in cache +- **`is_many_to_many`** (optional, default: False): Set to True for many-to-many relationships +- **`through_model`** (optional): Through model for many-to-many relationships +- **`source_field`** (optional): Source field name in the through model +- **`target_field`** (optional): Target field name in the through model + +### Advanced Configuration + +```python +from django.db import models + +# Custom queryset with select_related +config = MemoizedPrefetchConfig( + model=Author, + attributes=["author"], + queryset=Author.objects.select_related(...), + lru_cache_size=5000, +) + +# Prefetch all objects at startup (useful for small, frequently accessed tables) +config = MemoizedPrefetchConfig( + model=Publisher, + attributes=["publisher"], + prefetch_all=True, +) +``` + +## Integrations with other packages. + +The package automatically supports `django-seal` when available, all querysets which are sealable will be automatically sealed. + +This package works when using `django-tenants`. + +## Best Practices + +1. **Use appropriate cache sizes**: Set `lru_cache_size` based on your expected data volume and available memory +2. **Prefetch related objects**: Use custom querysets with `select_related` or `prefetch_related` for nested relationships +3. **Consider prefetch_all**: Use `prefetch_all=True` for small, frequently accessed reference tables +4. **Process in reasonable chunks**: Balance memory usage with query efficiency when choosing chunk sizes +5. 
**Monitor cache hit rates**: Ensure your cache size is appropriate for your data access patterns + +## Testing + +Run the test suite: + +```bash +uv run pytest +``` + +## License + +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. + +## Contributing + +Contributions are welcome! Please feel free to submit a Pull Request. + +## Authors + +- Mikuláš Poul (mikulas.poul@xelix.com) +- Cameron Hobbs (cameron.hobbs@xelix.com) diff --git a/pyproject.toml b/pyproject.toml index 0b9f423..40c6831 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "django-memoized-prefetch" -version = "0.1.0" +version = "0.1.1" description = "A memoized prefetch for Django." authors = [ {name = "Mikuláš Poul", email = "mikulas.poul@xelix.com"}, @@ -154,6 +154,7 @@ parametrize-names-type = "list" [dependency-groups] dev = [ "dirty-equals>=0.9.0", + "django-chunkator>=2.0.0", "django-seal>=1.7.1", "factory-boy>=3.3.3", "pytest-cov>=6.2.1", diff --git a/tests/test_project/bookshop/__init__.py b/tests/test_project/bookshop/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_project/bookshop/factories.py b/tests/test_project/bookshop/factories.py new file mode 100644 index 0000000..06080d6 --- /dev/null +++ b/tests/test_project/bookshop/factories.py @@ -0,0 +1,67 @@ +import random + +import factory + +from tests.test_project.bookshop.models import ( + Author, + Book, + Category, + Publisher, + Review, +) + + +class AuthorFactory(factory.django.DjangoModelFactory): + name = factory.Faker("name") + email = factory.Faker("email") + + class Meta: + model = Author + + +class PublisherFactory(factory.django.DjangoModelFactory): + name = factory.Faker("company") + country = factory.Faker("country") + + class Meta: + model = Publisher + + +class CategoryFactory(factory.django.DjangoModelFactory): + name = factory.Faker("word") + + class Meta: + model = Category + + +class 
BookFactory(factory.django.DjangoModelFactory): + title = factory.Faker("sentence", nb_words=4) + isbn = factory.Faker("isbn13") + author = factory.SubFactory(AuthorFactory) + translator = factory.SubFactory(AuthorFactory) + publisher = factory.SubFactory(PublisherFactory) + + @factory.post_generation + def categories(self, create: bool, extracted: list[Category]) -> None: + if not create: + return + + if extracted: + self.categories.set(extracted) + else: + # Create 1-3 random categories if none provided + categories = CategoryFactory.create_batch(random.randint(1, 3)) + self.categories.set(categories) + + class Meta: + model = Book + skip_postgeneration_save = True + + +class ReviewFactory(factory.django.DjangoModelFactory): + book = factory.SubFactory(BookFactory) + rating = factory.Faker("random_int", min=1, max=5) + comment = factory.Faker("text", max_nb_chars=500) + + class Meta: + model = Review diff --git a/tests/test_project/bookshop/migrations/0001_initial.py b/tests/test_project/bookshop/migrations/0001_initial.py new file mode 100644 index 0000000..23c41ab --- /dev/null +++ b/tests/test_project/bookshop/migrations/0001_initial.py @@ -0,0 +1,80 @@ +# Generated by Django 5.2.5 on 2025-08-26 14:29 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="Author", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("name", models.CharField(max_length=255)), + ("email", models.EmailField(max_length=254)), + ], + ), + migrations.CreateModel( + name="Category", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("name", models.CharField(max_length=100)), + ], + ), + migrations.CreateModel( + name="Publisher", + fields=[ + ("id", models.BigAutoField(auto_created=True, 
primary_key=True, serialize=False, verbose_name="ID")), + ("name", models.CharField(max_length=255)), + ("country", models.CharField(max_length=100)), + ], + ), + migrations.CreateModel( + name="Book", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("title", models.CharField(max_length=255)), + ("isbn", models.CharField(max_length=13)), + ( + "author", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, related_name="books", to="bookshop.author" + ), + ), + ( + "translator", + models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="translations", + to="bookshop.author", + ), + ), + ("categories", models.ManyToManyField(related_name="books", to="bookshop.category")), + ( + "publisher", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, related_name="books", to="bookshop.publisher" + ), + ), + ], + ), + migrations.CreateModel( + name="Review", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("rating", models.IntegerField()), + ("comment", models.TextField()), + ( + "book", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, related_name="reviews", to="bookshop.book" + ), + ), + ], + ), + ] diff --git a/tests/test_project/bookshop/migrations/__init__.py b/tests/test_project/bookshop/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_project/bookshop/models.py b/tests/test_project/bookshop/models.py new file mode 100644 index 0000000..8483636 --- /dev/null +++ b/tests/test_project/bookshop/models.py @@ -0,0 +1,30 @@ +from django.db import models + + +class Author(models.Model): + name = models.CharField(max_length=255) + email = models.EmailField() + + +class Publisher(models.Model): + name = models.CharField(max_length=255) + country = models.CharField(max_length=100) + + +class Category(models.Model): + 
name = models.CharField(max_length=100) + + +class Book(models.Model): + title = models.CharField(max_length=255) + isbn = models.CharField(max_length=13) + author = models.ForeignKey(Author, on_delete=models.CASCADE, related_name="books") + translator = models.ForeignKey(Author, on_delete=models.CASCADE, related_name="translations", null=True) + publisher = models.ForeignKey(Publisher, on_delete=models.CASCADE, related_name="books") + categories = models.ManyToManyField(Category, related_name="books") + + +class Review(models.Model): + book = models.ForeignKey(Book, on_delete=models.CASCADE, related_name="reviews") + rating = models.IntegerField() + comment = models.TextField() diff --git a/tests/test_project/settings.py b/tests/test_project/settings.py index 5008846..2751d00 100644 --- a/tests/test_project/settings.py +++ b/tests/test_project/settings.py @@ -37,6 +37,7 @@ "django.contrib.messages", "django.contrib.staticfiles", "tests.test_project.test_app", + "tests.test_project.bookshop", "seal", ] diff --git a/tests/test_readme.py b/tests/test_readme.py new file mode 100644 index 0000000..f998c4f --- /dev/null +++ b/tests/test_readme.py @@ -0,0 +1,65 @@ +import pytest +from chunkator import chunkator_page + +from django_memoized_prefetch import MemoizedPrefetch, MemoizedPrefetchConfig +from tests.test_project.bookshop.factories import BookFactory, ReviewFactory +from tests.test_project.bookshop.models import Author, Book, Category, Publisher, Review + +pytestmark = pytest.mark.django_db + + +class TestReadmeExamples: + @pytest.fixture(autouse=True) + def setup(self): + BookFactory.create_batch(100) + ReviewFactory.create_batch(100) + + def test_basic_naive(self): + for chunk in chunkator_page(Book.objects.all().prefetch_related("author", "translator", "publisher"), 10_000): + for book in chunk: + print(book.author.name, book.translator.name if book.translator is not None else None) + print(book.publisher.name) + + def test_basic(self): + memoized_prefetch = 
MemoizedPrefetch( + MemoizedPrefetchConfig(Author, ["author", "translator"]), + MemoizedPrefetchConfig(Publisher, ["publisher"], prefetch_all=True), + ) + + for chunk in chunkator_page(Book.objects.all(), 10_000): + memoized_prefetch.process_chunk(chunk) + + for book in chunk: + print(book.author.name, book.translator.name if book.translator is not None else None) + print(book.publisher.name) + + def test_nested(self): + memoized_prefetch = MemoizedPrefetch( + MemoizedPrefetchConfig(Publisher, ["book.publisher"]), + MemoizedPrefetchConfig(Author, ["book__author"]), + ) + + for chunk in chunkator_page(Review.objects.all(), 10000): + memoized_prefetch.process_chunk(chunk) + + def test_m2m(self): + # Configure for many-to-many relationships + memoized_prefetch = MemoizedPrefetch( + MemoizedPrefetchConfig( + model=Category, + attributes=["categories"], + is_many_to_many=True, + through_model=Book.categories.through, + source_field="book_id", + target_field="category_id", + ) + ) + + # Process books with their categories + for chunk in chunkator_page(Book.objects.all(), 10000): + memoized_prefetch.process_chunk(chunk) + + for book in chunk: + # Categories are prefetched and available + category_names = [cat.name for cat in book.categories.all()] + print(f"Book: {book.title}, Categories: {', '.join(category_names)}") diff --git a/tox.ini b/tox.ini index aa570f5..04ce6ae 100644 --- a/tox.ini +++ b/tox.ini @@ -21,5 +21,6 @@ deps= dirty-equals django-seal factory-boy + django-chunkator commands= pytest