import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
29 Document Splitting
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
chunk_size = 26
chunk_overlap = 4

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
Why doesn’t this split the string below? (text1 is exactly 26 characters, the same as chunk_size, so it already fits in one chunk.)
text1 = 'abcdefghijklmnopqrstuvwxyz'
r_splitter.split_text(text1)
['abcdefghijklmnopqrstuvwxyz']
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'
r_splitter.split_text(text2)
['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']
OK, this one splits: the second chunk repeats the 4-character overlap 'wxyz'. In the next example, though, the overlap is specified as 4 but looks like only 3 characters:
= "a b c d e f g h i j k l m n o p q r s t u v w x y z" text3
r_splitter.split_text(text3)
['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']
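To see where the "missing" overlap character went, compare each chunk's tail with the next chunk's head (a quick check, reusing r_splitter from above):
chunks = r_splitter.split_text(text3)
# The 4-character overlap is ' l m' (it starts with a space), but the splitter
# strips whitespace at chunk boundaries, so only 'l m' (3 characters) shows up.
for prev, nxt in zip(chunks, chunks[1:]):
    print(repr(prev[-4:]), "->", repr(nxt[:4]))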
c_splitter.split_text(text3)
['a b c d e f g h i j k l m n o p q r s t u v w x y z']
CharacterTextSplitter didn’t split at all because its default separator is '\n\n', which never appears in text3. Telling it to split on spaces instead:
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator=' '
)
c_splitter.split_text(text3)
Try your own examples!
29.1 Recursive splitting details
RecursiveCharacterTextSplitter is recommended for generic text. It tries its separators in order, recursing with the next one whenever a piece is still larger than chunk_size.
= """When writing documents, writers will use document structure to group content. \
some_text This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n \
Paragraphs are often delimited with a carriage return or two c arriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""
len(some_text)
497
c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator=' '
)
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separators=["\n\n", "\n", " ", ""]
)
c_splitter.split_text(some_text)
['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two c arriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
'have a space.and words are separated by space.']
r_splitter.split_text(some_text)
["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
'Paragraphs are often delimited with a carriage return or two c arriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']
Let’s reduce the chunk size a bit and add a period to our separators:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    # raw string: "\." is not a valid Python escape sequence
    separators=["\n\n", "\n", r"\. ", " ", ""]
)
r_splitter.split_text(some_text)
["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
'Paragraphs are often delimited with a carriage return or two c arriage returns. Carriage returns are the "backslash n" you see embedded in this',
'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)
["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
'Paragraphs are often delimited with a carriage return or two c arriage returns. Carriage returns are the "backslash n" you see embedded in this',
'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']
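Notice the output is unchanged: in recent LangChain releases, separators are escaped and matched literally unless you opt in to regex handling, so both the r"\. " and the lookbehind patterns fall through to the " " separator. If your installed version supports the is_separator_regex flag (an assumption worth checking against your version), you can enable regex matching so the lookbehind keeps each period attached to its sentence:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    # treat the separators as regular expressions instead of literal strings
    is_separator_regex=True,
)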
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("docs/cs229_lectures/MachineLearning-Lecture01.pdf")
pages = loader.load()
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)
docs = text_splitter.split_documents(pages)
len(docs)
len(pages)
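Splitting should yield more (and smaller) documents than the pages we started with. A quick sanity check, sketched here (the exact numbers depend on the PDF):
# Expect more chunks than pages; also peek at the largest chunk. Note that
# CharacterTextSplitter can exceed chunk_size when a single piece between
# separators is longer than the limit, so this is informative, not a hard rule.
print(len(docs), len(pages))
print(max(len(d.page_content) for d in docs))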
from langchain.document_loaders import NotionDirectoryLoader
loader = NotionDirectoryLoader("docs/Notion_DB")
notion_db = loader.load()
docs = text_splitter.split_documents(notion_db)
len(notion_db)
len(docs)
29.2 Token splitting
We can also split on token count explicitly, if we want.
This can be useful because LLMs often have context windows designated in tokens.
Tokens are often ~4 characters.
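To see this rule of thumb in action, you can count tokens directly with tiktoken, the tokenizer library TokenTextSplitter uses under the hood (a minimal sketch, assuming tiktoken is installed; the encoding name is one common choice, not the only one):
import tiktoken

# Encode a sample string and compare characters to tokens (~4:1 is typical).
enc = tiktoken.get_encoding("cl100k_base")
sample = "Tokens are often about four characters long."
tokens = enc.encode(sample)
print(len(sample), "characters ->", len(tokens), "tokens")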
from langchain.text_splitter import TokenTextSplitter
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
text1 = "foo bar bazzyfoo"
text_splitter.split_text(text1)
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)
docs = text_splitter.split_documents(pages)
docs[0]
pages[0].metadata
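The splitter copies each source document's metadata onto the chunks it produces; assuming the first chunk comes from the first page (true here, since chunks are produced in order), the two should match:
# Metadata such as 'source' and 'page' is carried from page to chunk.
assert docs[0].metadata == pages[0].metadata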
29.3 Context aware splitting
Chunking aims to keep text with common context together. Text splitting often uses sentences or other delimiters to keep related text together, but many documents (such as Markdown) have structure (headers) that can be used explicitly in splitting. We can use MarkdownHeaderTextSplitter to preserve header metadata in our chunks, as shown below.
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
= """# Title\n\n \
markdown_document ## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n
## Chapter 2\n\n \
Hi this is Molly"""
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)
md_header_splits
[Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'}, page_content='Hi this is Jim \nHi this is Joe'),
Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'}, page_content='Hi this is Lance'),
Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 2'}, page_content='Hi this is Molly')]
md_header_splits[0]
Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'}, page_content='Hi this is Jim \nHi this is Joe')
md_header_splits[1]
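Header-based splits can still be arbitrarily long, so a common follow-up is to run a size-based splitter on the result; the header metadata is carried onto the smaller chunks. A minimal sketch, reusing RecursiveCharacterTextSplitter from earlier (the sizes are illustrative):
size_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# Each smaller chunk keeps the 'Header 1'/'Header 2'/'Header 3' metadata
# assigned by MarkdownHeaderTextSplitter.
splits = size_splitter.split_documents(md_header_splits)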
Try on a real Markdown file, like a Notion database.
loader = NotionDirectoryLoader("docs/Notion_DB")
docs = loader.load()
txt = ' '.join([d.page_content for d in docs])
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(txt)
md_header_splits[0]