diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index e4c3687a..f189fb20 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -10,49 +10,56 @@ jobs:
   build-test:
     strategy:
       matrix:
-        python-version: [3.11, 3.13]
-        platform: [ubuntu-latest, macos-latest]
-    runs-on: ${{ matrix.platform }}
+        python-version: [3.11]
+        platform:
+          - { runner: ubuntu-latest, python_exec: ".venv/bin/python" }
+          - { runner: ubuntu-24.04-arm, python_exec: ".venv/bin/python" }
+          - { runner: macos-latest, python_exec: ".venv/bin/python" }
+          - { runner: macos-13, python_exec: ".venv/bin/python" }
+          - { runner: windows-latest, python_exec: ".venv\\Scripts\\python" }
+    runs-on: ${{ matrix.platform.runner }}
     steps:
       - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        id: setup_python
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+
       - run: rustup toolchain install stable --profile minimal
       - name: Rust Cache
         uses: Swatinem/rust-cache@v2
         with:
-          key: ${{ runner.os }}-rust-${{ matrix.python-version }}
-      - name: Rust build
-        run: cargo build --verbose
+          key: rust-${{ matrix.platform.runner }}-${{ matrix.python-version }}
       - name: Rust tests
         run: cargo test --verbose
-      - uses: actions/setup-python@v5
-        id: setup_python
-        with:
-          python-version: ${{ matrix.python-version }}
-          cache: 'pip'
       - uses: actions/cache@v4
         with:
           path: .venv
-          key: ${{ runner.os }}-pyenv-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('pyproject.toml') }}
+          key: pyenv-${{ matrix.platform.runner }}-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('pyproject.toml') }}
           restore-keys: |
-            ${{ runner.os }}-pyenv-${{ steps.setup_python.outputs.python-version }}-
+            pyenv-${{ matrix.platform.runner }}-${{ steps.setup_python.outputs.python-version }}-
+
       - name: Setup venv
         run: |
           python -m venv .venv
       - name: Install Python toolchains
         run: |
-          source .venv/bin/activate
-          pip install maturin mypy pytest pytest-asyncio
+          ${{ matrix.platform.python_exec }} -m pip install maturin mypy pytest pytest-asyncio
       - name: Python build
         run: |
-          source .venv/bin/activate
-          maturin develop -E all
+          ${{ matrix.platform.python_exec }} -m maturin develop -E all
       - name: Python type check (mypy)
         run: |
-          source .venv/bin/activate
-          mypy python
+          ${{ matrix.platform.python_exec }} -m mypy python
       - name: Python tests
+        if: ${{ !startsWith(matrix.platform.runner, 'windows') }}
+        run: |
+          ${{ matrix.platform.python_exec }} -m pytest --capture=no python/cocoindex/tests
+      - name: Python tests (Windows cmd)
+        if: ${{ startsWith(matrix.platform.runner, 'windows') }}
+        shell: cmd  # Use `cmd` to run tests on Windows, as PowerShell doesn't correctly detect the exit code set by `os._exit(0)`.
         run: |
-          source .venv/bin/activate
-          pytest python/cocoindex/tests
\ No newline at end of file
+          ${{ matrix.platform.python_exec }} -m pytest --capture=no python/cocoindex/tests
\ No newline at end of file
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 51b02e37..2b47fe72 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,8 +1,7 @@
-# This file is autogenerated by maturin v1.8.1
-# To update, run
-#
-#    maturin generate-ci github
+# This workflow can be triggered by a tag push (automatic release) or manually on any branch.
 #
+# - When triggered by a tag push, it builds and publishes a new version, including docs.
+# - When triggered manually, it's a dry run: it only builds, without publishing anything.
 
 name: release
 
 on:
@@ -31,11 +30,11 @@ jobs:
     strategy:
       matrix:
         platform:
-          - { os: linux, runner: ubuntu-24.04, target: x86_64, container: "ghcr.io/rust-cross/manylinux_2_28-cross:x86_64" }
-          - { os: linux, runner: ubuntu-24.04, target: aarch64, container: "ghcr.io/rust-cross/manylinux_2_28-cross:aarch64" }
-          - { os: windows, runner: windows-latest, target: x64 }
+          - { os: linux, runner: ubuntu-latest, target: x86_64, container: "ghcr.io/rust-cross/manylinux_2_28-cross:x86_64" }
+          - { os: linux, runner: ubuntu-24.04-arm, target: aarch64, container: "ghcr.io/rust-cross/manylinux_2_28-cross:aarch64" }
+          - { os: macos, runner: macos-latest, target: aarch64 }
           - { os: macos, runner: macos-13, target: x86_64 }
-          - { os: macos, runner: macos-14, target: aarch64 }
+          - { os: windows, runner: windows-latest, target: x64 }
     steps:
       - uses: actions/checkout@v4
       - uses: actions/download-artifact@v4
@@ -43,12 +42,12 @@
           name: Cargo.toml
       - uses: actions/setup-python@v5
         with:
-          python-version: 3.x
+          python-version: 3.13
       - name: Build wheels
         uses: PyO3/maturin-action@v1
         with:
           target: ${{ matrix.platform.target }}
-          args: --release --out dist --find-interpreter
+          args: --release --out dist
           sccache: 'true'
           manylinux: auto
           container: ${{ matrix.platform.container }}
@@ -58,6 +57,24 @@ jobs:
           name: wheels-${{ matrix.platform.os }}-${{ matrix.platform.target }}
           path: dist
 
+  test-abi3:
+    runs-on: ubuntu-24.04
+    needs: build
+    strategy:
+      matrix:
+        py: ["3.11", "3.12", "3.13"]
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          name: wheels-linux-x86_64
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.py }}
+      - run: python -V
+      - run: pip install --find-links=./ cocoindex
+      - run: python -c "import cocoindex, sys; print('import ok on', sys.version)"
+
+
   sdist:
     runs-on: ubuntu-latest
     needs: [create-versioned-toml]
@@ -80,7 +97,7 @@ jobs:
   release:
     name: Release
     runs-on: ubuntu-latest
-    needs: [create-versioned-toml, build, sdist]
+    needs: [create-versioned-toml, build, test-abi3, sdist]
     permissions:
       # Use to sign the release artifacts
       id-token: write
@@ -111,5 +128,6 @@ jobs:
   release-docs:
     name: Release Docs
     needs: [release]
+    if: ${{ startsWith(github.ref, 'refs/tags/') }}
     uses: ./.github/workflows/_doc_release.yml
     secrets: inherit
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index 89985a14..d160a16b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,7 +15,12 @@
 name = "cocoindex_engine"
 crate-type = ["cdylib"]
 
 [dependencies]
-pyo3 = { version = "0.25.1", features = ["chrono", "auto-initialize", "uuid"] }
+pyo3 = { version = "0.25.1", features = [
+    "abi3-py311",
+    "auto-initialize",
+    "chrono",
+    "uuid",
+] }
 pythonize = "0.25.0"
 pyo3-async-runtimes = { version = "0.25.0", features = ["tokio-runtime"] }
diff --git a/docs/docs/examples/examples/simple_vector_index.md b/docs/docs/examples/examples/simple_vector_index.md
index 5fdc4b9f..28162190 100644
--- a/docs/docs/examples/examples/simple_vector_index.md
+++ b/docs/docs/examples/examples/simple_vector_index.md
@@ -71,34 +71,21 @@ with data_scope["documents"].row() as doc:
 
 ### Embed each chunk
 
 ```python
-@cocoindex.transform_flow()
-def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
-    """
-    Embed the text using a SentenceTransformer model.
-    This is a shared logic between indexing and querying, so extract it as a function.
- """ - return text.transform( +with doc["chunks"].row() as chunk: + chunk["embedding"] = chunk["text"].transform( cocoindex.functions.SentenceTransformerEmbed( - model="sentence-transformers/all-MiniLM-L6-v2")) + model="sentence-transformers/all-MiniLM-L6-v2" + ) + ) + doc_embeddings.collect(filename=doc["filename"], location=chunk["location"], + text=chunk["text"], embedding=chunk["embedding"]) ``` -![Embedding](/img/examples/simple_vector_index/embed.png) - -This code defines a transformation function that converts text into vector embeddings using the SentenceTransformer model. -`@cocoindex.transform_flow()` is needed to share the transformation across indexing and query. -This decorator marks this as a reusable transformation flow that can be called on specific input data from user code using `eval()`, as shown in the search function below. The `MiniLM-L6-v2` model is a good balance of speed and quality for text embeddings, though you can swap in other SentenceTransformer models as needed. - -Plug in the `text_to_embedding` function and collect the embeddings. -```python -with doc["chunks"].row() as chunk: - chunk["embedding"] = text_to_embedding(chunk["text"]) - doc_embeddings.collect(filename=doc["filename"], location=chunk["location"], - text=chunk["text"], embedding=chunk["embedding"]) -``` +![Embedding](/img/examples/simple_vector_index/embed.png) ## Export the embeddings @@ -119,10 +106,32 @@ CocoIndex supports other vector databases as well, with 1-line switch. ## Query the index +### Define a shared flow for both indexing and querying + +```python +@cocoindex.transform_flow() +def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]: + """ + Embed the text using a SentenceTransformer model. + This is a shared logic between indexing and querying, so extract it as a function. + """ + return text.transform( + cocoindex.functions.SentenceTransformerEmbed( + model="sentence-transformers/all-MiniLM-L6-v2")) +``` + +This code defines a transformation function that converts text into vector embeddings using the SentenceTransformer model. +`@cocoindex.transform_flow()` is needed to share the transformation across indexing and query. + +This decorator marks this as a reusable transformation flow that can be called on specific input data from user code using `eval()`, as shown in the search function below. + +### Write query + CocoIndex doesn't provide additional query interface at the moment. We can write SQL or rely on the query engine by the target storage, if any. + ```python def search(pool: ConnectionPool, query: str, top_k: int = 5): table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, "doc_embeddings") @@ -166,6 +175,19 @@ if __name__ == "__main__": _main() ``` +In the function above, most parts are standard query logic - you can use any libraries you like. +There're two CocoIndex-specific logic: + +1. Get the table name from the export target in the `text_embedding_flow` above. + Since the table name for the `Postgres` target is not explicitly specified in the `export()` call, + CocoIndex uses a default name. + `cocoindex.utils.get_target_default_name()` is a utility function to get the default table name for this case. + +2. Evaluate the transform flow defined above with the input query, to get the embedding. + It's done by the `eval()` method of the transform flow `text_to_embedding`. 
+   The return type of this method is `list[float]`, as declared in the `text_to_embedding()` function (`cocoindex.DataSlice[list[float]]`).
+
+
 ## Time to have fun!
 
 - Run the following command to setup and update the index.
diff --git a/docs/docs/getting_started/quickstart.md b/docs/docs/getting_started/quickstart.md
index f9b2760c..6d0f1e49 100644
--- a/docs/docs/getting_started/quickstart.md
+++ b/docs/docs/getting_started/quickstart.md
@@ -3,281 +3,175 @@ title: Quickstart
 description: Get started with CocoIndex in 10 minutes
 ---
 
-import ReactPlayer from 'react-player'
+import { GitHubButton, YouTubeButton, DocumentationButton } from '../../src/components/GitHubButton';
 
-# Build your first CocoIndex project
+
+
 
-This guide will help you get up and running with CocoIndex in just a few minutes. We'll build a project that does:
-* Read files from a directory
-* Perform basic chunking and embedding
-* Load the data into a vector store (PG Vector)
+In this tutorial, we'll build an index with text embeddings, keeping it minimal and focused on the core indexing flow.
 
-
-## Prerequisite: Install CocoIndex environment
+## Flow Overview
+![Flow](/img/examples/simple_vector_index/flow.png)
 
-We'll need to install a bunch of dependencies for this project.
+1. Read text files from the local filesystem
+2. Chunk each document
+3. For each chunk, embed it with a text embedding model
+4. Store the embeddings in a vector database for retrieval
+
+## Setup
 
 1. Install CocoIndex:
 
    ```bash
   pip install -U 'cocoindex[embeddings]'
   ```
 
-2. You can skip this step if you already have a Postgres database with pgvector extension installed.
-   If not, the easiest way is to bring up a Postgres database using docker compose:
-
-   - Make sure Docker Compose is installed: [docs](https://docs.docker.com/compose/install/)
-   - Start a Postgres SQL database for cocoindex using our docker compose config:
-
-   ```bash
-   docker compose -f <(curl -L https://raw.githubusercontent.com/cocoindex-io/cocoindex/refs/heads/main/dev/postgres.yaml) up -d
-   ```
-
-## Step 1: Prepare directory for your project
+2. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres).
 
-1. Open the terminal and create a new directory for your project:
+3. Create a new directory for your project:
 
    ```bash
   mkdir cocoindex-quickstart
   cd cocoindex-quickstart
   ```
 
-2. Prepare input files for the index. Put them in a directory, e.g. `markdown_files`.
-   If you don't have any files at hand, you may download the example [markdown_files.zip](markdown_files.zip) and unzip it in the current directory.
+4. Place input files in a directory `markdown_files`. You may download the example files from [markdown_files.zip](markdown_files.zip).
 
-## Step 2: Define the indexing flow
+## Define a flow
 
-Create a new file `quickstart.py` and import the `cocoindex` library:
+Create a new file `main.py` and define a flow.
 
-```python title="quickstart.py"
-import cocoindex
-```
-
+```python title="main.py" +import cocoindex -```python title="quickstart.py" @cocoindex.flow_def(name="TextEmbedding") def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope): - # Add a data source to read files from a directory - data_scope["documents"] = flow_builder.add_source( - cocoindex.sources.LocalFile(path="markdown_files")) - - # Add a collector for data to be exported to the vector index - doc_embeddings = data_scope.add_collector() - - # Transform data of each document - with data_scope["documents"].row() as doc: - # Split the document into chunks, put into `chunks` field - doc["chunks"] = doc["content"].transform( - cocoindex.functions.SplitRecursively(), - language="markdown", chunk_size=2000, chunk_overlap=500) - - # Transform data of each chunk - with doc["chunks"].row() as chunk: - # Embed the chunk, put into `embedding` field - chunk["embedding"] = chunk["text"].transform( - cocoindex.functions.SentenceTransformerEmbed( - model="sentence-transformers/all-MiniLM-L6-v2")) - - # Collect the chunk into the collector. - doc_embeddings.collect(filename=doc["filename"], location=chunk["location"], - text=chunk["text"], embedding=chunk["embedding"]) - - # Export collected data to a vector index. - doc_embeddings.export( - "doc_embeddings", - cocoindex.targets.Postgres(), - primary_key_fields=["filename", "location"], - vector_indexes=[ - cocoindex.VectorIndexDef( - field_name="embedding", - metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) + # ... See subsections below for function body ``` -Notes: - -1. The `@cocoindex.flow_def` declares a function to be a CocoIndex flow. +### Add Source and Collector -2. In CocoIndex, data is organized in different *data scopes*. - * `data_scope`, representing all data. - * `doc`, representing each row of `documents`. - * `chunk`, representing each row of `chunks`. +```python title="main.py" +# add source +data_scope["documents"] = flow_builder.add_source( + cocoindex.sources.LocalFile(path="markdown_files")) -3. A *data source* extracts data from an external source. - In this example, the `LocalFile` data source imports local files as a KTable (table with key columns, see [KTable](../core/data_types#ktable) for details), each row has `"filename"` and `"content"` fields. +# add data collector +doc_embeddings = data_scope.add_collector() +``` -4. After defining the KTable, we extend a new field `"chunks"` to each row by *transforming* the `"content"` field using `SplitRecursively`. The output of the `SplitRecursively` is also a KTable representing each chunk of the document, with `"location"` and `"text"` fields. +`flow_builder.add_source` will create a table with sub fields (`filename`, `content`) -5. After defining the KTable, we extend a new field `"embedding"` to each row by *transforming* the `"text"` field using `SentenceTransformerEmbed`. + -6. In CocoIndex, a *collector* collects multiple entries of data together. In this example, the `doc_embeddings` collector collects data from all `chunk`s across all `doc`s, and uses the collected data to build a vector index `"doc_embeddings"`, using `Postgres`. + -## Step 3: Run the indexing pipeline and queries +### Process each document -Specify the database URL by environment variable: +With CocoIndex, it is easy to process nested data structures. -```bash -export COCOINDEX_DATABASE_URL="postgresql://cocoindex:cocoindex@localhost:5432/cocoindex" +```python title="main.py" +with data_scope["documents"].row() as doc: + # ... 
 ```
 
-Now we're ready to build the index:
-```bash
-cocoindex update --setup quickstart.py
+#### Chunk each document
+
+```python title="main.py"
+doc["chunks"] = doc["content"].transform(
+    cocoindex.functions.SplitRecursively(),
+    language="markdown", chunk_size=2000, chunk_overlap=500)
 ```
 
-If you run it the first time for this flow, CocoIndex will automatically create its persistent backends (tables in the database).
-CocoIndex will ask you to confirm the action, enter `yes` to proceed.
+We extend a new field `chunks` to each row by *transforming* the `content` field using `SplitRecursively`. The output of `SplitRecursively` is a KTable representing each chunk of the document.
 
-CocoIndex will run for a few seconds and populate the target table with data as declared by the flow. It will output the following statistics:
+
 
-```
-documents: 3 added, 0 removed, 0 updated
-```
+![Chunking](/img/examples/simple_vector_index/chunk.png)
 
-## Step 4 (optional): Run queries against the index
 
-CocoIndex excels at transforming your data and storing it (a.k.a. indexing).
-The goal of transforming your data is usually to query against it.
-Once you already have your index built, you can directly access the transformed data in the target database.
-CocoIndex also provides utilities for you to do this more seamlessly.
 
-In this example, we'll use the [`psycopg` library](https://www.psycopg.org/) along with pgvector to connect to the database and run queries on vector data.
-Please make sure the required packages are installed:
+#### Embed each chunk and collect the embeddings
 
-```bash
-pip install numpy "psycopg[binary,pool]" pgvector
+```python title="main.py"
+with doc["chunks"].row() as chunk:
+    # embed
+    chunk["embedding"] = chunk["text"].transform(
+        cocoindex.functions.SentenceTransformerEmbed(
+            model="sentence-transformers/all-MiniLM-L6-v2"
+        )
+    )
+
+    # collect
+    doc_embeddings.collect(
+        filename=doc["filename"],
+        location=chunk["location"],
+        text=chunk["text"],
+        embedding=chunk["embedding"],
+    )
 ```
 
-### Step 4.1: Extract common transformations
+This code embeds each chunk using the SentenceTransformer library and collects the results.
+
+![Embedding](/img/examples/simple_vector_index/embed.png)
+
+
+
+### Export the embeddings to Postgres
+
+```python title="main.py"
+doc_embeddings.export(
+    "doc_embeddings",
+    cocoindex.storages.Postgres(),
+    primary_key_fields=["filename", "location"],
+    vector_indexes=[
+        cocoindex.VectorIndexDef(
+            field_name="embedding",
+            metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
+        )
+    ],
+)
+```
 
-Between your indexing flow and the query logic, one piece of transformation is shared: compute the embedding of a text.
-i.e. they should use exactly the same embedding model and parameters.
+CocoIndex supports other vector databases as well, with a 1-line switch.
 
-Let's extract that into a function:
+
 
-```python title="quickstart.py"
-from numpy.typing import NDArray
-import numpy as np
-
-@cocoindex.transform_flow()
-def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[NDArray[np.float32]]:
-    return text.transform(
-        cocoindex.functions.SentenceTransformerEmbed(
-            model="sentence-transformers/all-MiniLM-L6-v2"))
-```
+## Run the indexing pipeline
 
-`cocoindex.DataSlice[str]` represents certain data in the flow (e.g. a field in a data scope), with type `str` at runtime.
-Similar to the `text_embedding_flow()` above, the `text_to_embedding()` is also to constructing the flow instead of directly doing computation,
-so the type it takes is `cocoindex.DataSlice[str]` instead of `str`.
-See [Data Slice](../core/flow_def#data-slice) for more details.
+- Specify the database URL by environment variable:
 
-Then the corresponding code in the indexing flow can be simplified by calling this function:
+  ```bash
+  export COCOINDEX_DATABASE_URL="postgresql://cocoindex:cocoindex@localhost:5432/cocoindex"
+  ```
 
-```python title="quickstart.py"
-...
-# Transform data of each chunk
-with doc["chunks"].row() as chunk:
-    # Embed the chunk, put into `embedding` field
-    chunk["embedding"] = text_to_embedding(chunk["text"])
+- Build the index:
 
-    # Collect the chunk into the collector.
-    doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
-                           text=chunk["text"], embedding=chunk["embedding"])
-...
-```
+  ```bash
+  cocoindex update --setup main.py
+  ```
 
-The function decorator `@cocoindex.transform_flow()` is used to declare a function as a CocoIndex transform flow,
-i.e., a sub flow only performing transformations, without importing data from sources or exporting data to targets.
-The decorator is needed for evaluating the flow with specific input data in Step 4.2 below.
+CocoIndex will run for a few seconds and populate the target table with data as declared by the flow. It will output the following statistics:
 
-### Step 4.2: Provide the query logic
-
-Now we can create a function to query the index upon a given input query:
-
-```python title="quickstart.py"
-from psycopg_pool import ConnectionPool
-from pgvector.psycopg import register_vector
-
-def search(pool: ConnectionPool, query: str, top_k: int = 5):
-    # Get the table name, for the export target in the text_embedding_flow above.
-    table_name = cocoindex.utils.get_target_default_name(text_embedding_flow, "doc_embeddings")
-    # Evaluate the transform flow defined above with the input query, to get the embedding.
-    query_vector = text_to_embedding.eval(query)
-    # Run the query and get the results.
-    with pool.connection() as conn:
-        register_vector(conn)
-        with conn.cursor() as cur:
-            cur.execute(f"""
-                SELECT filename, text, embedding <=> %s AS distance
-                FROM {table_name} ORDER BY distance LIMIT %s
-            """, (query_vector, top_k))
-            return [
-                {"filename": row[0], "text": row[1], "score": 1.0 - row[2]}
-                for row in cur.fetchall()
-            ]
-```
+```
 
-In the function above, most parts are standard query logic - you can use any libraries you like.
-There're two CocoIndex-specific logic:
-
-1. Get the table name from the export target in the `text_embedding_flow` above.
-   Since the table name for the `Postgres` target is not explicitly specified in the `export()` call,
-   CocoIndex uses a default name.
-   `cocoindex.utils.get_target_default_name()` is a utility function to get the default table name for this case.
-
-2. Evaluate the transform flow defined above with the input query, to get the embedding.
-   It's done by the `eval()` method of the transform flow `text_to_embedding`.
-   The return type of this method is `NDArray[np.float32]` as declared in the `text_to_embedding()` function (`cocoindex.DataSlice[NDArray[np.float32]]`).
-
-### Step 4.3: Add the main script logic
-
-Now we can add the main logic to the program. It uses the query function we just defined:
-
-```python title="quickstart.py"
-if __name__ == "__main__":
-    # Initialize CocoIndex library states
-    cocoindex.init()
-
-    # Initialize the database connection pool.
-    pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
-    # Run queries in a loop to demonstrate the query capabilities.
-    while True:
-        try:
-            query = input("Enter search query (or Enter to quit): ")
-            if query == '':
-                break
-            # Run the query function with the database connection pool and the query.
-            results = search(pool, query)
-            print("\nSearch results:")
-            for result in results:
-                print(f"[{result['score']:.3f}] {result['filename']}")
-                print(f"    {result['text']}")
-                print("---")
-            print()
-        except KeyboardInterrupt:
-            break
+documents: 3 added, 0 removed, 0 updated
 ```
 
-It interacts with users and search the database by calling the `search()` method created in Step 4.2.
+That's it for the main indexing flow.
 
-### Step 4.4: Run queries against the index
-
-Now we can run the same Python file, which will run the new added main logic:
+## End to end: Query the index (Optional)
 
-```bash
-python quickstart.py
-```
+If you want to build an end-to-end query flow that also searches the index, you can follow the [simple_vector_index](https://cocoindex.io/docs/examples/simple_vector_index#query-the-index) example.
 
-It will ask you to enter a query and it will return the top 5 results.
 
 ## Next Steps
 
 Next, you may want to:
 * Learn about [CocoIndex Basics](../core/basics.md).
-* Learn about other examples in the [examples](https://github.com/cocoindex-io/cocoindex/tree/main/examples) directory.
-  * The `text_embedding` example is this quickstart.
-  * Pick other examples to learn upon your interest.
+* Explore more of what you can build with CocoIndex in the [examples](https://cocoindex.io/docs/examples) section.
diff --git a/docs/docs/ops/sources.md b/docs/docs/ops/sources.md
index 1a936b93..bce063e9 100644
--- a/docs/docs/ops/sources.md
+++ b/docs/docs/ops/sources.md
@@ -313,6 +313,27 @@ The spec takes the following fields:
 * `included_columns` (`list[str]`, optional): non-primary-key columns to include. If not specified, all non-PK columns are included.
 * `ordinal_column` (`str`, optional): to specify a non-primary-key column used for change tracking and ordering, e.g. can be a modified timestamp or a monotonic version number. Supported types are integer-like (`bigint`/`integer`) and timestamps (`timestamp`, `timestamptz`). `ordinal_column` must not be a primary key column.
+* `notification` (`cocoindex.sources.PostgresNotification`, optional): when present, enables change capture based on Postgres LISTEN/NOTIFY. It has the following fields:
+  * `channel_name` (`str`, optional): the Postgres notification channel to listen on. CocoIndex will automatically create the channel with the given name. If omitted, CocoIndex uses `{flow_name}__{source_name}__cocoindex`.
+
+  :::info
+
+  If `notification` is provided, CocoIndex listens for row changes using Postgres LISTEN/NOTIFY and creates the required database objects on demand when the flow starts listening:
+
+  - Function to create the notification message: `{channel_name}_n`.
+  - Trigger to react to table changes: `{channel_name}_t` on the specified `table_name`.
+
+ + Currently CocoIndex doesn't automatically clean up these objects when the flow is dropped (unlike targets) + It's usually OK to leave them as they are, but if you want to clean them up, you can run the following SQL statements to manually drop them: + + ```sql + DROP TRIGGER IF EXISTS {channel_name}_t ON "{table_name}"; + DROP FUNCTION IF EXISTS {channel_name}_n(); + ``` + + ::: ### Schema diff --git a/docs/docusaurus.config.ts b/docs/docusaurus.config.ts index dd2028cd..0e93fcc8 100644 --- a/docs/docusaurus.config.ts +++ b/docs/docusaurus.config.ts @@ -115,7 +115,7 @@ const config: Config = { { label: 'Documentation', type: 'doc', - docId: 'getting_started/overview', + docId: 'getting_started/quickstart', position: 'left', }, { diff --git a/docs/src/components/GitHubButton/index.tsx b/docs/src/components/GitHubButton/index.tsx index d5498bd5..d87892e8 100644 --- a/docs/src/components/GitHubButton/index.tsx +++ b/docs/src/components/GitHubButton/index.tsx @@ -35,7 +35,7 @@ type GitHubButtonProps = { margin?: string; }; -function GitHubButton({ url, margin }: GitHubButtonProps): ReactNode { +function GitHubButton({ url, margin = '0' }: GitHubButtonProps): ReactNode { return (