Parameters
- `query` (str, optional): Search query text. Mutually exclusive with `query_image`.
- `filters` (Dict[str, Any], optional): Optional metadata filters.
- `k` (int, optional): Number of results. Defaults to 4.
- `min_score` (float, optional): Minimum similarity threshold. Defaults to 0.0.
- `use_colpali` (bool, optional): Whether to use the ColPali-style embedding model to retrieve chunks (only works for documents ingested with `use_colpali=True`). Defaults to True.
- `folder_name` (str | List[str], optional): Optional folder scope. Accepts canonical paths (e.g., `/projects/alpha/specs`) or a list of paths/names.
- `folder_depth` (int, optional): Folder scope depth. `None`/`0` = exact match, `-1` = include all descendants, `n > 0` = include descendants up to `n` levels deep.
- `padding` (int, optional): Number of additional chunks/pages to retrieve before and after matched chunks (ColPali only). Defaults to 0.
- `output_format` (str, optional): Controls how image chunks are returned:
  - `"base64"` (default): Returns base64-encoded image data
  - `"url"`: Returns presigned HTTPS URLs
  - `"text"`: Converts images to markdown text via OCR
- `query_image` (str, optional): Base64-encoded image for reverse image search. Mutually exclusive with `query`. Requires `use_colpali=True`.
Metadata Filters
Filters follow the same JSON syntax across the API. See the Metadata Filtering guide for supported operators and typed comparisons.
Returns
`List[FinalChunkResult]`: List of chunk results
Examples
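A hedged sketch of how the documented parameters might be combined in a call. The `Client` class below is a stand-in stub that only records the call, not the real SDK class; the filter field `department` is hypothetical.

```python
from typing import Any, Dict, List, Optional, Union


class Client:
    """Stub mirroring the documented retrieve_chunks signature and defaults."""

    def retrieve_chunks(
        self,
        query: Optional[str] = None,
        query_image: Optional[str] = None,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 4,
        min_score: float = 0.0,
        use_colpali: bool = True,
        folder_name: Optional[Union[str, List[str]]] = None,
        folder_depth: Optional[int] = None,
        padding: int = 0,
        output_format: str = "base64",
    ) -> Dict[str, Any]:
        # query and query_image are mutually exclusive; exactly one is required
        if (query is None) == (query_image is None):
            raise ValueError("Provide exactly one of query or query_image")
        return {"query": query, "k": k, "output_format": output_format}


db = Client()
result = db.retrieve_chunks(
    query="quarterly revenue figures",
    filters={"department": "finance"},   # metadata filter (hypothetical field)
    k=6,
    min_score=0.3,
    folder_name="/projects/alpha/specs",
    folder_depth=-1,                     # include all descendants of the folder
    output_format="url",
)
```

The same keyword arguments would apply to the async variant, awaited instead of called directly.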
FinalChunkResult Properties
The `FinalChunkResult` objects returned by this method have the following properties:
- `content` (str | PILImage): Chunk content (text or image)
- `score` (float): Relevance score
- `document_id` (str): Parent document ID
- `chunk_number` (int): Chunk sequence number
- `metadata` (Dict[str, Any]): Document metadata
- `content_type` (str): Content type
- `filename` (Optional[str]): Original filename
- `download_url` (Optional[str]): URL to download full document
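A minimal sketch of consuming these results. The dataclass below is a stand-in mirroring the documented properties (the real class lives in the SDK); it shows one way to separate text chunks from image chunks and order them by relevance.

```python
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional


@dataclass
class FinalChunkResult:
    """Stand-in with the documented fields; content is str or PIL.Image."""
    content: Any
    score: float
    document_id: str
    chunk_number: int
    metadata: Dict[str, Any] = field(default_factory=dict)
    content_type: str = "text/plain"
    filename: Optional[str] = None
    download_url: Optional[str] = None


def summarize(chunks: List[FinalChunkResult]) -> str:
    """Join text chunk contents, highest score first; skip image chunks."""
    texts = [
        c.content
        for c in sorted(chunks, key=lambda c: c.score, reverse=True)
        if isinstance(c.content, str)
    ]
    return "\n\n".join(texts)


chunks = [
    FinalChunkResult("Second paragraph.", 0.71, "doc1", 2),
    FinalChunkResult("First paragraph.", 0.93, "doc1", 1),
]
context = summarize(chunks)  # "First paragraph.\n\nSecond paragraph."
```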
Output Format Options
- `"base64"` (default): Image chunks are returned as base64 data (the SDK attempts to decode these into a `PIL.Image` for `FinalChunkResult.content`).
- `"url"`: Image chunks are returned as presigned HTTPS URLs in `content`. This is convenient for UIs and LLMs that accept remote image URLs (e.g., via `image_url`).
- `"text"`: Image chunks are converted to markdown text via OCR. Use this when you need faster inference or when documents are mostly text-based.
- Text chunks are unaffected by `output_format` and are always returned as strings.
- The `download_url` field may be populated for image chunks. When using `output_format="url"`, it will typically match `content` for those chunks.
When to Use Each Format
| Format | Best For |
|---|---|
| `base64` | Direct image processing, local applications |
| `url` | Web UIs, LLMs with vision capabilities (lighter on network) |
| `text` | Faster inference, text-heavy documents, context length concerns |
**base64 vs url:** Both formats pass images to LLMs for visual understanding and produce similar results. However, `url` is lighter on network transfer since only the URL is sent to your application (the LLM fetches the image directly). This can result in faster response times, especially with multiple images.

**When to use text:** Passing images to LLMs for inference can be slow and consume significant context tokens. Use `output_format="text"` when you need faster inference speeds or when your documents are primarily text-based.

If you're hitting context limits with images, it may be because they aren't being passed correctly to the model. See Generating Completions with Retrieved Chunks for examples of properly passing images (both base64 and URLs) to vision-capable models like GPT-4o. To obtain a download link for the full document, see `get_document_download_url`.
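The distinction above can be sketched as follows: with `output_format="url"` the presigned URL is passed to the model directly, while base64 content is wrapped as a data URL. This builds an OpenAI-style chat message; the helper names and example URL are illustrative, not part of the SDK.

```python
from typing import Dict, List


def image_part(content: str) -> Dict:
    """Wrap chunk content as an image_url message part: pass presigned URLs
    through unchanged, wrap base64 payloads as data URLs."""
    if content.startswith("https://"):
        url = content                              # output_format="url"
    else:
        url = f"data:image/png;base64,{content}"   # output_format="base64"
    return {"type": "image_url", "image_url": {"url": url}}


def build_message(question: str, image_chunks: List[str]) -> Dict:
    """Combine a text question with retrieved image chunks into one user message."""
    parts: List[Dict] = [{"type": "text", "text": question}]
    parts.extend(image_part(c) for c in image_chunks)
    return {"role": "user", "content": parts}


msg = build_message(
    "What does this diagram show?",
    ["https://example.com/presigned/page-3.png"],  # hypothetical presigned URL
)
```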
Reverse Image Search
You can search using an image instead of text by providing `query_image` with a base64-encoded image. This enables finding visually similar content in your documents.
Reverse image search requires documents to be ingested with `use_colpali=True`. You must provide either `query` or `query_image`, but not both.
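A short sketch of preparing a local image for `query_image`. The base64 encoding is the documented requirement; the commented-out client call and filename are illustrative.

```python
import base64


def load_query_image(path: str) -> str:
    """Read an image file and return it base64-encoded, as query_image expects."""
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("ascii")


# Hypothetical usage against a client instance `db`:
# chunks = db.retrieve_chunks(
#     query_image=load_query_image("logo.png"),
#     use_colpali=True,
# )
```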
