Using Hypothetical Document Embeddings (HyDE) to Improve Retrieval
Last Updated: October 3, 2024
This cookbook has an accompanying article with a complete walkthrough: “Optimizing Retrieval with HyDE”.
In this cookbook, we build Haystack components that let us easily incorporate HyDE into our RAG pipelines to optimize retrieval.
To learn more about HyDE and when it’s useful, check out our guide to Hypothetical Document Embeddings (HyDE).
Install Requirements
!pip install haystack-ai sentence-transformers datasets
In the following sections, we will be using the OpenAIGenerator, so we need to provide our OpenAI API key.
from getpass import getpass
import os
# Ask for the key interactively (getpass does not echo the typed input) and
# publish it as the OPENAI_API_KEY environment variable for this session.
# The encapsulated HyDE component further down reads this same variable via
# Secret.from_env_var("OPENAI_API_KEY").
os.environ["OPENAI_API_KEY"] = getpass("Enter your openAI key:")
Building a Pipeline for Hypothetical Document Embeddings
We will build a Haystack pipeline that generates ‘fake’ documents.
For this part, we are using the OpenAIGenerator with a PromptBuilder that instructs the model to generate paragraphs.
from haystack.components.generators.openai import OpenAIGenerator
from haystack.components.builders import PromptBuilder
# LLM that writes the hypothetical ("fake") answer paragraphs.
# "n": 5 requests five completions per prompt, so each question yields five
# hypothetical documents; temperature and max_tokens are passed along with it
# in generation_kwargs.
generator = OpenAIGenerator(
model="gpt-4o-mini",
generation_kwargs={"n": 5, "temperature": 0.75, "max_tokens": 400},
)
# Prompt template with a single variable, `question`, which is supplied when
# the pipeline is run.
template="""Given a question, generate a paragraph of text that answers the question.
Question: {{question}}
Paragraph:"""
prompt_builder = PromptBuilder(template=template)
Next, we use the OutputAdapter to transform the generated paragraphs into a list of Documents. This way, we will be able to use the SentenceTransformersDocumentEmbedder to create embeddings, since this component expects List[Document].
from haystack import Document
from haystack.components.converters import OutputAdapter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from typing import List
# Adapter that turns the incoming `answers` value into Documents: the custom
# "build_doc" filter wraps every element of the list in Document(content=...).
# output_type declares the result as List[Document].
adapter = OutputAdapter(
template="{{answers | build_doc}}",
output_type=List[Document],
custom_filters={"build_doc": lambda data: [Document(content=d) for d in data]}
)
# Embedder for the hypothetical documents.
embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
# Prepare the embedder before first use (presumably this loads the model —
# confirm against the Haystack embedder documentation).
embedder.warm_up()
Finally, we create a custom component, HypotheticalDocumentEmbedder, that expects documents and returns a hypothetical_embedding: the average of the embeddings of the “hypothetical” (fake) documents. To learn more about this technique and where it’s useful, check out our Guide to HyDE.
from numpy import array, mean
from haystack import component
@component
class HypotheticalDocumentEmbedder:
    """Average the embeddings of several hypothetical documents into one vector.

    The component expects documents that already carry an ``embedding`` and
    produces a single ``hypothetical_embedding`` that can be used as a query
    embedding.
    """

    @component.output_types(hypothetical_embedding=List[float])
    def run(self, documents: List[Document]):
        """Return the element-wise mean of the documents' embeddings.

        :param documents: Embedded documents; every ``doc.embedding`` must be
            set and all embeddings must have the same length.
        :returns: ``{"hypothetical_embedding": [...]}`` where the list has the
            same length as one input embedding.
        :raises ValueError: If ``documents`` is empty or any document has no
            embedding.
        """
        embeddings = [doc.embedding for doc in documents]
        # Fail with a clear message instead of the opaque numpy/len() error
        # that an empty or partially embedded batch would otherwise trigger.
        if not embeddings:
            raise ValueError("Cannot build a hypothetical embedding from an empty list of documents.")
        if any(embedding is None for embedding in embeddings):
            raise ValueError("Every document must have an embedding before it can be averaged.")
        # axis=0 averages across documents, keeping one value per dimension.
        avg_embedding = mean(array(embeddings), axis=0)
        return {"hypothetical_embedding": avg_embedding.tolist()}
We add all of our components to a pipeline to generate a hypothetical document embedding.
from haystack import Pipeline
# Instantiate the averaging component defined above.
hyde = HypotheticalDocumentEmbedder()
pipeline = Pipeline()
# Register each component under the name that connect() and run() refer to.
pipeline.add_component(name="prompt_builder", instance=prompt_builder)
pipeline.add_component(name="generator", instance=generator)
pipeline.add_component(name="adapter", instance=adapter)
pipeline.add_component(name="embedder", instance=embedder)
pipeline.add_component(name="hyde", instance=hyde)
# Data flow: prompt -> LLM replies -> Documents -> embedded Documents ->
# averaged embedding.
pipeline.connect("prompt_builder", "generator")
pipeline.connect("generator.replies", "adapter.answers")
pipeline.connect("adapter.output", "embedder.documents")
pipeline.connect("embedder.documents", "hyde.documents")
query = "What should I do if I have a fever?"
# Only the prompt builder receives external input: the question to answer.
result = pipeline.run(data={"prompt_builder": {"question": query}})
# Results are keyed by component name; print the output of the "hyde" component.
print(result["hyde"])
Build a HyDE Component That Encapsulates the Whole Logic
This section shows you how to create a HypotheticalDocumentEmbedder that instead encapsulates the entire logic and also allows us to provide the embedding model as an optional parameter.
This “mega” component does a few things:
- Allows the user to pick the LLM which generates the hypothetical documents
- Allows users to define how many documents should be created with nr_completions
- Allows users to define the embedding model they want to use to generate the HyDE embeddings.
from haystack import Pipeline, component, Document, default_to_dict, default_from_dict
from haystack.components.converters import OutputAdapter
from haystack.components.embedders.sentence_transformers_document_embedder import SentenceTransformersDocumentEmbedder
from haystack.components.generators.openai import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from typing import Dict, Any, List
from numpy import array, mean
from haystack.utils import Secret
@component
class HypotheticalDocumentEmbedder:
    """Turn a query string into a single HyDE query embedding.

    An internal pipeline prompts an OpenAI model for ``nr_completions``
    paragraphs that answer the query, wraps each paragraph in a ``Document``,
    and embeds them with a Sentence Transformers model. ``run`` returns the
    element-wise mean of those embeddings.
    """

    def __init__(
        self,
        instruct_llm: str = "gpt-4o-mini",
        instruct_llm_api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
        nr_completions: int = 5,
        embedder_model: str = "sentence-transformers/all-MiniLM-L6-v2",
    ):
        """Build the generator, prompt builder, adapter, embedder and pipeline.

        :param instruct_llm: Name of the OpenAI model that writes the
            hypothetical documents.
        :param instruct_llm_api_key: Secret holding the OpenAI API key.
        :param nr_completions: Number of hypothetical documents requested per
            query (sent to the generator as ``n``).
        :param embedder_model: Sentence Transformers model used to embed the
            hypothetical documents.
        """
        self.instruct_llm = instruct_llm
        self.instruct_llm_api_key = instruct_llm_api_key
        self.nr_completions = nr_completions
        self.embedder_model = embedder_model

        self.generator = OpenAIGenerator(
            api_key=self.instruct_llm_api_key,
            model=self.instruct_llm,
            generation_kwargs={"n": self.nr_completions, "temperature": 0.75, "max_tokens": 400},
        )
        self.prompt_builder = PromptBuilder(
            template="""Given a question, generate a paragraph of text that answers the question.
Question: {{question}}
Paragraph:
"""
        )
        # The "build_doc" filter wraps every generated reply in a Document so
        # the embedder receives List[Document].
        self.adapter = OutputAdapter(
            template="{{answers | build_doc}}",
            output_type=List[Document],
            custom_filters={"build_doc": lambda data: [Document(content=d) for d in data]},
        )
        self.embedder = SentenceTransformersDocumentEmbedder(model=self.embedder_model, progress_bar=False)
        self.embedder.warm_up()

        self.pipeline = Pipeline()
        self.pipeline.add_component(name="prompt_builder", instance=self.prompt_builder)
        self.pipeline.add_component(name="generator", instance=self.generator)
        self.pipeline.add_component(name="adapter", instance=self.adapter)
        self.pipeline.add_component(name="embedder", instance=self.embedder)
        self.pipeline.connect("prompt_builder", "generator")
        self.pipeline.connect("generator.replies", "adapter.answers")
        self.pipeline.connect("adapter.output", "embedder.documents")

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the component, including its internal pipeline.

        The API key is stored in its ``Secret.to_dict()`` form so the result
        contains only plain data. Haystack refuses to serialise token-based
        secrets, so use an environment-variable secret if the component must
        be saved.

        :returns: Dictionary with the init parameters plus a ``"pipeline"``
            entry holding the serialised internal pipeline.
        """
        data = default_to_dict(
            self,
            instruct_llm=self.instruct_llm,
            instruct_llm_api_key=self.instruct_llm_api_key.to_dict(),
            nr_completions=self.nr_completions,
            embedder_model=self.embedder_model,
        )
        data["pipeline"] = self.pipeline.to_dict()
        return data

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HypotheticalDocumentEmbedder":
        """Rebuild the component from the output of ``to_dict``.

        :param data: Serialised component. ``instruct_llm_api_key`` may be
            either a serialised secret (dict) or an in-memory ``Secret``.
        :returns: The deserialised component.
        """
        # Work on a copy so the caller's dictionary is left untouched.
        init_params = dict(data.get("init_parameters", {}))
        api_key = init_params.get("instruct_llm_api_key")
        if isinstance(api_key, dict):
            init_params["instruct_llm_api_key"] = Secret.from_dict(api_key)
        hyde_obj = default_from_dict(cls, {**data, "init_parameters": init_params})
        # __init__ has already built a pipeline; replace it only when a
        # serialised one was provided.
        if "pipeline" in data:
            hyde_obj.pipeline = Pipeline.from_dict(data["pipeline"])
        return hyde_obj

    @component.output_types(hypothetical_embedding=List[float])
    def run(self, query: str):
        """Generate hypothetical documents for ``query`` and average their embeddings.

        :param query: The question to answer hypothetically.
        :returns: ``{"hypothetical_embedding": [...]}``, a single query vector
            that is the mean of the hypothetical document embeddings.
        :raises ValueError: If the internal pipeline produced no documents.
        """
        result = self.pipeline.run(data={"prompt_builder": {"question": query}})
        embeddings = [doc.embedding for doc in result["embedder"]["documents"]]
        if not embeddings:
            raise ValueError("The generator returned no hypothetical documents to embed.")
        # axis=0 averages across documents, keeping one value per dimension.
        avg_embedding = mean(array(embeddings), axis=0)
        return {"hypothetical_embedding": avg_embedding.tolist()}
Use HyDE For Retrieval
Let’s see how we can use this component in a full pipeline. First, let’s index some documents into an InMemoryDocumentStore.
from datasets import load_dataset, Dataset
from haystack import Pipeline, Document
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
# Sentence Transformers model used by index_docs to embed the indexed documents.
# NOTE(review): the HyDE query embedding is matched against these vectors, so
# this presumably has to stay identical to the embedder_model passed to
# HypotheticalDocumentEmbedder — confirm when changing either one.
embedder_model = "sentence-transformers/all-MiniLM-L6-v2"
def index_docs(data: Dataset):
    """Index the ``train`` split of ``data`` into a new in-memory document store.

    Each row of ``data["train"]`` is converted with ``Document.from_dict`` and
    sent through four stages: cleaner, sentence splitter (10 sentences per
    chunk), embedder (using the module-level ``embedder_model``) and a writer
    configured with ``policy="skip"``.

    :param data: Dataset object that supports ``data["train"]``.
    :returns: The populated ``InMemoryDocumentStore``.
    """
    store = InMemoryDocumentStore()

    # Stages in processing order; each one feeds the next.
    stages = [
        ("cleaner", DocumentCleaner()),
        ("splitter", DocumentSplitter(split_by="sentence", split_length=10)),
        ("embedder", SentenceTransformersDocumentEmbedder(model=embedder_model)),
        ("writer", DocumentWriter(document_store=store, policy="skip")),
    ]

    indexing = Pipeline()
    for stage_name, stage in stages:
        indexing.add_component(stage_name, stage)

    # Link every stage to its successor.
    for (upstream, _), (downstream, _) in zip(stages, stages[1:]):
        indexing.connect(upstream, downstream)

    documents = [Document.from_dict(row) for row in data["train"]]
    indexing.run({"cleaner": {"documents": documents}})
    return store
# Load the "Tuana/game-of-thrones" dataset and index it; index_docs reads the
# "train" split of the loaded object.
data = load_dataset("Tuana/game-of-thrones")
doc_store = index_docs(data)
We can now run a retrieval pipeline that doesn’t just retrieve based on the query embedding. Instead, it uses the HypotheticalDocumentEmbedder to create hypothetical document embeddings based on our query and uses these new embeddings to retrieve documents.
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
def retriever_with_hyde(doc_store):
    """Build a retrieval pipeline that searches ``doc_store`` with a HyDE embedding.

    The pipeline has two components: ``query_embedder`` (a
    ``HypotheticalDocumentEmbedder`` using "gpt-4o-mini" with 5 completions)
    and ``retriever`` (an ``InMemoryEmbeddingRetriever`` over ``doc_store``).
    The embedder's ``hypothetical_embedding`` output feeds the retriever's
    ``query_embedding`` input.

    :param doc_store: Document store to retrieve from.
    :returns: The assembled ``Pipeline``.
    """
    hyde_retrieval = Pipeline()
    hyde_retrieval.add_component(
        instance=HypotheticalDocumentEmbedder(instruct_llm="gpt-4o-mini", nr_completions=5),
        name="query_embedder",
    )
    hyde_retrieval.add_component(
        instance=InMemoryEmbeddingRetriever(document_store=doc_store),
        name="retriever",
    )
    hyde_retrieval.connect("query_embedder.hypothetical_embedding", "retriever.query_embedding")
    return hyde_retrieval
retrieval_pipeline = retriever_with_hyde(doc_store)
# NOTE(review): "Araya" looks like a misspelling of "Arya" — confirm whether
# the query text is intentional before changing it.
query = "Who is Araya Stark?"
# The query goes to the HyDE embedder; top_k=5 is passed to the retriever.
retrieval_pipeline.run(data={"query_embedder": {"query": query}, "retriever": {"top_k": 5}})