import pandas as pd
from langchain_openai import ChatOpenAI
10 How to Q&A over CSVs
- CSV data: https://python.langchain.com/v0.2/docs/how_to/sql_csv/
- SQL data: https://python.langchain.com/v0.2/docs/tutorials/sql_qa/
10.1 LLM
= ChatOpenAI(model="gpt-3.5-turbo-0125") llm
10.2 Read CSV
= pd.read_csv("data/Titanic-Dataset.csv")
df print(df.shape)
print(df.columns.tolist())
(891, 12)
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
10.3 SQL
import os
from langchain_community.utilities import SQLDatabase
from sqlalchemy import create_engine
if not os.path.exists("titanic.db"):
= create_engine("sqlite:///titanic.db")
engine "titanic", engine, index=False) df.to_sql(
= SQLDatabase(engine=engine)
db print(db.dialect)
sqlite
print(db.get_usable_table_names())
print(db.run("SELECT * FROM titanic WHERE Age < 2;"))
['titanic']
[(79, 1, 2, 'Caldwell, Master. Alden Gates', 'male', 0.83, 0, 2, '248738', 29.0, None, 'S'), (165, 0, 3, 'Panula, Master. Eino Viljami', 'male', 1.0, 4, 1, '3101295', 39.6875, None, 'S'), (173, 1, 3, 'Johnson, Miss. Eleanor Ileen', 'female', 1.0, 1, 1, '347742', 11.1333, None, 'S'), (184, 1, 2, 'Becker, Master. Richard F', 'male', 1.0, 2, 1, '230136', 39.0, 'F4', 'S'), (306, 1, 1, 'Allison, Master. Hudson Trevor', 'male', 0.92, 1, 2, '113781', 151.55, 'C22 C26', 'S'), (382, 1, 3, 'Nakid, Miss. Maria ("Mary")', 'female', 1.0, 0, 2, '2653', 15.7417, None, 'C'), (387, 0, 3, 'Goodwin, Master. Sidney Leonard', 'male', 1.0, 5, 2, 'CA 2144', 46.9, None, 'S'), (470, 1, 3, 'Baclini, Miss. Helene Barbara', 'female', 0.75, 2, 1, '2666', 19.2583, None, 'C'), (645, 1, 3, 'Baclini, Miss. Eugenie', 'female', 0.75, 2, 1, '2666', 19.2583, None, 'C'), (756, 1, 2, 'Hamalainen, Master. Viljo', 'male', 0.67, 1, 1, '250649', 14.5, None, 'S'), (789, 1, 3, 'Dean, Master. Bertram Vere', 'male', 1.0, 1, 2, 'C.A. 2315', 20.575, None, 'S'), (804, 1, 3, 'Thomas, Master. Assad Alexander', 'male', 0.42, 0, 1, '2625', 8.5167, None, 'C'), (828, 1, 2, 'Mallet, Master. Andre', 'male', 1.0, 0, 2, 'S.C./PARIS 2079', 37.0042, None, 'C'), (832, 1, 2, 'Richards, Master. George Sibley', 'male', 0.83, 1, 1, '29106', 18.75, None, 'S')]
10.4 SQL Agent
from langchain_community.agent_toolkits import create_sql_agent
= create_sql_agent(llm, db=db, agent_type="openai-tools", verbose=True) agent_executor
= agent_executor.invoke({"input": "what's the average age of survivors"})
res res
> Entering new SQL Agent Executor chain...
Invoking: `sql_db_list_tables` with `{}`
titanic
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`
CREATE TABLE titanic (
"PassengerId" BIGINT,
"Survived" BIGINT,
"Pclass" BIGINT,
"Name" TEXT,
"Sex" TEXT,
"Age" FLOAT,
"SibSp" BIGINT,
"Parch" BIGINT,
"Ticket" TEXT,
"Fare" FLOAT,
"Cabin" TEXT,
"Embarked" TEXT
)
/*
3 rows from titanic table:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 None S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38.0 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.925 None S
*/
Invoking: `sql_db_query` with `{'query': 'SELECT AVG(Age) AS Average_Age FROM titanic WHERE Survived = 1'}`
[(28.343689655172415,)]The average age of survivors in the Titanic dataset is approximately 28.34 years.
> Finished chain.
{'input': "what's the average age of survivors",
'output': 'The average age of survivors in the Titanic dataset is approximately 28.34 years.'}
res
{'input': "what's the average age of survivors",
'output': 'The average age of survivors in the Titanic dataset is approximately 28.34 years.'}