Hybrid Search
Hybrid search combines vector similarity with full-text search and structured filtering, so a single query can match on meaning, on exact keywords, and on metadata fields at the same time.
Set Up Hybrid Search
from datetime import datetime
from typing import Optional

from pytidb import Field, TableModel, FullTextField
from pytidb.integrations import embed_fn
from sqlalchemy import JSON
class Document(TableModel):
    """Schema of the "documents" table used by the hybrid search examples.

    Declares a full-text field (`text`), a vector field built from it
    (`text_vec`), and plain fields such as `category` and `created_at`.
    """

    __tablename__ = "documents"

    id: int = Field(primary_key=True)
    title: str = Field()

    # Enable full-text search
    text: str = FullTextField()

    # Vector field whose source is the `text` field; presumably populated
    # automatically by `embed_fn` -- confirm against the embedding setup.
    text_vec: list[float] = embed_fn.VectorField(source_field="text")

    category: str = Field()
    created_at: datetime = Field()

    # Arbitrary metadata stored using the SQLAlchemy JSON type.
    meta: dict = Field(sa_type=JSON)
# Create table with hybrid capabilities.
# `db` is the database client; it must already exist at this point (its
# creation is not shown in this snippet).
# NOTE(review): if_exists="overwrite" presumably replaces an existing
# "documents" table -- confirm before running against real data.
table = db.create_table(schema=Document, if_exists="overwrite")
Hybrid Query Examples
Basic Hybrid Search
# Combine vector and text search
hybrid_search = table.search("machine learning", search_type="hybrid")

# Apply the distance threshold, then merge both rankings.
ranked_search = hybrid_search.distance_threshold(0.8).fusion(method="rrf")  # Reciprocal Rank Fusion

# Cap the result count and run the search; `query` holds what to_list() returns.
query = ranked_search.limit(10).to_list()
Advanced Filtering
# Hybrid search with structured filtering
# NOTE(review): the two chained .filter() calls are assumed to be combined
# (category AND date) -- confirm against the search API.
filtered_search = (
    table.search("AI applications", search_type="hybrid")
    .filter(Document.category == "technology")
    .filter(Document.created_at >= datetime(2024, 1, 1))
)

# Rank the filtered candidates with weighted fusion and fetch the top 20.
results = (
    filtered_search
    .distance_threshold(0.7)
    .fusion(method="weighted", vector_weight=0.7, text_weight=0.3)
    .limit(20)
    .to_list()
)
Fusion Methods
Reciprocal Rank Fusion (RRF)
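RRF combines the ranked result lists produced by the different search methods, scoring each document by its position in each list rather than by the raw scores: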
# Build a hybrid query fused with Reciprocal Rank Fusion.
# The query is only built here; chain .to_list() onto it to fetch results.
query = (
    # search_type is passed as a keyword argument to table.search().
    table.search("deep learning frameworks", search_type="hybrid")
    # k is the RRF smoothing constant: in the standard formula each result
    # contributes 1 / (k + rank) for every ranked list it appears in.
    .fusion(method="rrf", k=60)
    .limit(10)
)
Weighted Scoring
Manually control the importance of each search method:
# Build a hybrid query fused by explicit per-method weights.
# The query is only built here; chain .to_list() onto it to fetch results.
query = (
    # search_type is passed as a keyword argument to table.search().
    table.search("neural networks", search_type="hybrid")
    # NOTE(review): the weights below sum to 1.0. `filter_weight` is not used
    # by the weighted example under "Advanced Filtering" -- confirm that the
    # fusion API accepts it.
    .fusion(
        method="weighted",
        vector_weight=0.6,  # Semantic similarity weight
        text_weight=0.3,  # Full-text match weight
        filter_weight=0.1,  # Structured filter weight
    )
)
Real-World Example
def _preview(text: str, max_chars: int = 200) -> str:
    """Return `text` cut to `max_chars`, adding "..." only if it was cut."""
    if len(text) <= max_chars:
        return text
    return text[:max_chars] + "..."


def search_knowledge_base(
    query: str,
    category: Optional[str] = None,
    date_range: Optional[tuple[datetime, datetime]] = None,
    limit: int = 20,
):
    """Run a hybrid search over the documents table and summarize the hits.

    Args:
        query: Text to search for.
        category: If given and non-empty, keep only documents whose
            `category` equals this value.
        date_range: Optional `(start_date, end_date)` pair; if given, keep
            only documents whose `created_at` lies between the two
            (presumably inclusive, as with SQL BETWEEN -- confirm).
        limit: Maximum number of results to return.

    Returns:
        A list of dicts with the keys "title", "text" (a preview of at most
        200 characters, followed by "..." when truncated), "category",
        "relevance_score", and "created_at".
    """
    search_query = table.search(query, search_type="hybrid")

    # Apply filters if provided
    if category:
        search_query = search_query.filter(Document.category == category)
    if date_range:
        start_date, end_date = date_range
        search_query = search_query.filter(
            Document.created_at.between(start_date, end_date)
        )

    # Execute hybrid search
    results = (
        search_query
        .distance_threshold(0.75)
        .fusion(method="rrf")
        .limit(limit)
        .to_list()
    )

    # NOTE(review): attribute access below assumes to_list() yields objects
    # with these attributes rather than plain dicts -- confirm.
    # NOTE(review): "relevance_score" is filled from `distance`; if smaller
    # distances mean closer matches, lower values here are more relevant.
    return [
        {
            "title": doc.title,
            "text": _preview(doc.text),
            "category": doc.category,
            "relevance_score": doc.distance,
            "created_at": doc.created_at,
        }
        for doc in results
    ]
Best Practices
- Balance Weights: Adjust fusion weights based on your use case
- Filter Early: Apply structured filters before vector/text search
- Test Different Methods: Compare RRF and weighted fusion on your own data
- Monitor Performance: Track query latency and result quality