from typing import Optional
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
8 Build an Extraction Chain
Summary: How to extract information from text into a Python class
From: https://python.langchain.com/v0.2/docs/tutorials/extraction/
8.1 Model
# The chat model used for structured extraction in the chains below.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")
8.2 Single Extract
8.2.1 Schema
class Person(BaseModel):
    """Information about a person."""

    # ^ Doc-string for the entity Person.
    # This doc-string is sent to the LLM as the description of the schema Person,
    # and it can help to improve extraction results.
    # Note that:
    # 1. Each field is an `Optional` -- this allows the model to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.
    name: Optional[str] = Field(default=None, description="The name of the person")
    hair_color: Optional[str] = Field(
        default=None, description="The color of the person's hair if known"
    )
    height_in_meters: Optional[str] = Field(
        default=None, description="Height measured in meters"
    )


# The schema is an ordinary pydantic model and can be instantiated directly;
# omitted fields fall back to their `None` defaults.
Person(name="B")
# => Person(name='B', hair_color=None, height_in_meters=None)
8.2.2 Extractor
from typing import Optional
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)
8.2.3 Chain & Execute
# Chain the prompt into the model, constraining the output to the Person schema.
runnable = prompt | llm.with_structured_output(schema=Person)

text = "Alan Smith is 6 feet tall and has blond hair."
res = runnable.invoke({"text": text})
res
# => Person(name='Alan Smith', hair_color='blond', height_in_meters='1.83')

# The result is a plain Python (pydantic) object with normal attribute access.
type(res)
# => __main__.Person
res.name
# => 'Alan Smith'
8.3 Multiple Extract
8.3.1 Schema
from typing import List, Optional
from langchain_core.pydantic_v1 import BaseModel, Field
class Person(BaseModel):
    """Information about a person."""

    # ^ Doc-string for the entity Person.
    # This doc-string is sent to the LLM as the description of the schema Person,
    # and it can help to improve extraction results.
    # Note that:
    # 1. Each field is an `Optional` -- this allows the model to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.
    name: Optional[str] = Field(default=None, description="The name of the person")
    hair_color: Optional[str] = Field(
        default=None, description="The color of the person's hair if known"
    )
    height_in_meters: Optional[str] = Field(
        default=None, description="Height measured in meters"
    )


class Data(BaseModel):
    """Extracted data about people."""

    # Creates a model so that we can extract multiple entities.
    people: List[Person]
8.3.2 Chain & Invoke
# Rebuild the chain against the Data schema so the model can return
# a list of Person entities instead of a single one.
runnable = prompt | llm.with_structured_output(schema=Data)

text = "My name is Jeff, my hair is black and i am 6 feet tall. Anna has the same color hair as me."
res2 = runnable.invoke({"text": text})
res2
# => Data(people=[Person(name='Jeff', hair_color='black', height_in_meters='1.83'),
#                 Person(name='Anna', hair_color='black', height_in_meters=None)])

res2.people
# => [Person(name='Jeff', hair_color='black', height_in_meters='1.83'),
#     Person(name='Anna', hair_color='black', height_in_meters=None)]