by Grayson Adkins, updated April 12, 2024
This notebook demonstrates auto-merging retrieval, an advanced RAG technique for improving retrieval performance.
This notebook is largely based on the DeepLearning.AI course Building and Evaluating Advanced RAG Applications by Jerry Liu of LlamaIndex and Anupam Datta of CMU and TrueEra.
However, in this implementation I make a few changes to the original course material, including migrating the deprecated ServiceContext configuration to the new Settings object.
We want to improve upon basic (naive) RAG by providing better context to the LLM, yielding better generated responses.
With auto-merging retrieval, we construct a hierarchy of larger parent nodes and smaller child nodes that reference their parent, i.e. a parent node consists of all the sentences of its child nodes. At retrieval time, if a majority of a parent node's child nodes are retrieved, the child nodes are replaced with the parent node. This process ensures that even if we don't perfectly retrieve all relevant child nodes, we still include that information in the context via the parent node.
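The merging rule just described can be sketched in a few lines of plain Python. This is a toy illustration, not LlamaIndex's actual implementation: `auto_merge` and `parent_to_children` are hypothetical names, and the 0.5 threshold is meant to mirror what I understand to be the default `simple_ratio_thresh` of LlamaIndex's `AutoMergingRetriever`.

```python
def auto_merge(retrieved_ids, parent_to_children, threshold=0.5):
    """Toy sketch of auto-merging: if more than `threshold` of a parent's
    children were retrieved, replace those children with the parent."""
    merged = set(retrieved_ids)
    for parent, children in parent_to_children.items():
        hits = [c for c in children if c in merged]
        if len(hits) / len(children) > threshold:
            merged -= set(hits)   # drop the partial set of children...
            merged.add(parent)    # ...and use the full parent node instead
    return merged

# Parent "P" has four child chunks; three of the four were retrieved,
# so the retriever swaps them for the parent.
hierarchy = {"P": ["c1", "c2", "c3", "c4"]}
print(auto_merge({"c1", "c2", "c3"}, hierarchy))  # {'P'}
```

With only one of four children retrieved, no merge happens and the lone child is returned as-is.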
I find auto-merging retrieval to be hit-and-miss in terms of whether it yields better quality answers, but it provides a significant cost savings through more efficient token utilization.
In my first round of tests, I observed Answer Relevance improve by 10.6% and Groundedness improve by 18.2%, while Context Relevance decreased by 13.4%. However, in subsequent tests, I observed degraded performance across all three metrics.
Yet, when Jerry Liu demoed this technique in his DeepLearning.AI course, he showed significant improvement not just over basic RAG but also over sentence window retrieval; in my own tests, sentence window retrieval performed better. This discrepancy likely has to do with my data and set of eval questions, though I can't point to anything in particular.
On a positive note, compared to basic RAG (using LlamaIndex's Direct Query Engine), I observed a 78.4% decrease in total tokens, while maintaining roughly comparable answer quality. This reduction resulted in a 3x cost savings. Intuitively, we can assume the auto-merging retrieval technique is providing more granular, succinct, and relevant context than the basic RAG pipeline.
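As a quick sanity check on those figures, the token and cost totals from the TruLens comparison at the end of this notebook reproduce both numbers (the totals below are copied from that table):

```python
# Totals from the TruLens dashboard comparison table below.
direct_tokens, automerge_tokens = 21_150, 4_560
direct_cost, automerge_cost = 0.03, 0.01

token_reduction = 1 - automerge_tokens / direct_tokens
cost_savings = direct_cost / automerge_cost

print(f"Token reduction: {token_reduction:.1%}")  # ~78.4%
print(f"Cost savings: {cost_savings:.1f}x")       # ~3x
```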
Overall, I would not rely exclusively on auto-merging retrieval, but it would be interesting to supplement it with other advanced techniques.
!pip install -qU llama-index trulens_eval sentence_transformers
import os
from dotenv import load_dotenv,find_dotenv
# # Load OPENAI_API_KEY from local .env file
# load_dotenv(find_dotenv())
# Or set it like this
os.environ["OPENAI_API_KEY"] = "sk-..."
## Print key to check
# print(os.environ["OPENAI_API_KEY"])
!mkdir -p 'data'
!wget 'https://raw.githubusercontent.com/dbredvick/paul-graham-to-kindle/main/paul_graham_essays.txt' -O 'data/paul_graham_essays.txt'
--2024-04-12 16:45:50-- https://raw.githubusercontent.com/dbredvick/paul-graham-to-kindle/main/paul_graham_essays.txt Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 3075911 (2.9M) [text/plain] Saving to: ‘data/paul_graham_essays.txt’ data/paul_graham_es 100%[===================>] 2.93M --.-KB/s in 0.07s 2024-04-12 16:45:50 (43.8 MB/s) - ‘data/paul_graham_essays.txt’ saved [3075911/3075911]
from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader(
input_files=["./data/paul_graham_essays.txt"]
).load_data()
print(type(documents), "\n")
print(len(documents), "\n")
print(type(documents[0]))
print(documents[0])
<class 'list'> 1 <class 'llama_index.core.schema.Document'> Doc ID: 3a0e8c5c-4d7d-462b-b6a9-0ba264b9c2ac Text: # RSS [](index.html) Aaron Swartz created a scraped [feed](http://www.aaronsw.com/2002/feeds/pgessays.rss) of the essays page. * * * # This Year We Can End the Death Penalty in California [](index.html) November 2016 If you're a California voter, there is an important proposition on your ballo...
# If using multiple docs, combine into a single doc to improve chunking performance
# Here we also use a Document object, which stores the text along with `metadata`
# and `relationships` with other Documents/Nodes.
from llama_index.core import Document
document = Document(text="\n\n".join([doc.text for doc in documents]))
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.settings import Settings
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
Settings.embed_model = OpenAIEmbedding()
When you use `from_documents`, your Documents are split into chunks and parsed into Node objects: lightweight abstractions over text strings that keep track of metadata and relationships.
from llama_index.core import VectorStoreIndex
index = VectorStoreIndex.from_documents([document])
A query engine takes in a natural language query, and returns a rich response. You can compose multiple query engines to achieve more advanced capability.
query_engine = index.as_query_engine()
response = query_engine.query(
"What's the difference between a maker's schedule and a manager's schedule?"
)
print(str(response))
The difference between a maker's schedule and a manager's schedule lies in how time is structured and utilized. The manager's schedule is divided into one-hour intervals, allowing for frequent task changes and meetings. In contrast, the maker's schedule involves working in larger blocks of time, such as half a day, to focus on creative tasks like programming or writing without interruptions. Meetings are disruptive for those on the maker's schedule as they break the flow of work, while they are more manageable for those on the manager's schedule who are accustomed to frequent task switching.
Some of these questions are intentionally tricky. For example, the title of one Paul Graham essay is "The 18 Mistakes That Kill Startups", but I phrase my question as "What are the 18 reasons startups fail according to Paul Graham?" While these two sentences are clearly semantically similar, the LLM often responds with "Paul Graham does not provide 18 reasons startups fail." This slightly altered wording has a tendency to trip up the RAG process.
eval_questions = ["What's a good way for someone with kids to fund a startup?",
"What are the ten reasons why the US has such a high concentration of startups?",
"What are some ways to avoid copying the wrong thing?",
"What is 'good procrastination' according to Paul Graham?",
"What are the 18 reasons startups fail according to Paul Graham?",
"What are the six principles for making new things?",
"What is ramen profitable?",
"Who are Paul Graham's top founders?",
"Which VC investor famously passed on AirBnB after a mutual introduction by Paul Graham?",
"What are the things should founders do that don't scale?"]
# Write questions to a file
with open('eval_questions.txt', 'w') as file:
for item in eval_questions:
file.write("%s\n" % item)
We use the following "RAG Triad" to measure the performance of the retrieval and generation steps:
Use LLMs to evaluate the answers to our questions.
We will run the query again with the TruLens context. We'll send each query to our query engine, and in the background the TruLens recorder is evaluating each of these against the RAG triad.
from trulens_eval import Tru
tru = Tru()
tru.reset_database()
🦑 Tru initialized with db url sqlite:///default.sqlite . 🛑 Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.
from trulens_eval import (
Feedback,
TruLlama,
OpenAI
)
from trulens_eval.feedback import Groundedness
import numpy as np
openai = OpenAI()
qa_relevance = (
Feedback(openai.relevance_with_cot_reasons, name="Answer Relevance")
.on_input_output()
)
qs_relevance = (
Feedback(openai.relevance_with_cot_reasons, name = "Context Relevance")
.on_input()
.on(TruLlama.select_source_nodes().node.text)
.aggregate(np.mean)
)
grounded = Groundedness(groundedness_provider=openai)
groundedness = (
Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
.on(TruLlama.select_source_nodes().node.text)
.on_output()
.aggregate(grounded.grounded_statements_aggregator)
)
feedbacks = [qa_relevance, qs_relevance, groundedness]
def get_prebuilt_trulens_recorder(query_engine, app_id):
tru_recorder = TruLlama(
query_engine,
app_id=app_id,
feedbacks=feedbacks
)
return tru_recorder
tru_recorder = get_prebuilt_trulens_recorder(query_engine,
app_id="Direct Query Engine")
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` . ✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` . ✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` . ✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip.
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text . ✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
with tru_recorder as recording:
for question in eval_questions:
response = query_engine.query(question)
records, feedback = tru.get_records_and_feedback(app_ids=[])
records.head()
app_id | app_json | type | record_id | input | output | tags | record_json | cost_json | perf_json | ts | Answer Relevance | Context Relevance | Groundedness | Answer Relevance_calls | Context Relevance_calls | Groundedness_calls | latency | total_tokens | total_cost | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Direct Query Engine | {"tru_class_info": {"name": "TruLlama", "modul... | RetrieverQueryEngine(llama_index.core.query_en... | record_hash_e01842e9f18a63860ab3e942c959a0ac | "What's a good way for someone with kids to fu... | "A good way for someone with kids to fund a st... | - | {"record_id": "record_hash_e01842e9f18a63860ab... | {"n_requests": 2, "n_successful_requests": 2, ... | {"start_time": "2024-04-12T16:49:29.502010", "... | 2024-04-12T16:49:34.430179 | 0.9 | 0.60 | 0.333333 | [{'args': {'prompt': 'What's a good way for so... | [{'args': {'prompt': 'What's a good way for so... | [{'args': {'source': '**Friends and Family** ... | 4 | 2175 | 0.003284 |
1 | Direct Query Engine | {"tru_class_info": {"name": "TruLlama", "modul... | RetrieverQueryEngine(llama_index.core.query_en... | record_hash_5f217eba75c9e28ab35af3acd2bf556c | "What are the ten reasons why the US has such ... | "1. The US allows immigration, attracting smar... | - | {"record_id": "record_hash_5f217eba75c9e28ab35... | {"n_requests": 2, "n_successful_requests": 2, ... | {"start_time": "2024-04-12T16:49:35.289369", "... | 2024-04-12T16:49:41.234482 | 1.0 | 0.85 | NaN | [{'args': {'prompt': 'What are the ten reasons... | [{'args': {'prompt': 'What are the ten reasons... | NaN | 5 | 2259 | 0.003446 |
2 | Direct Query Engine | {"tru_class_info": {"name": "TruLlama", "modul... | RetrieverQueryEngine(llama_index.core.query_en... | record_hash_90fdcaf45a20a1370a8dcfe1d7261588 | "What are some ways to avoid copying the wrong... | "Avoid copying the wrong thing by copying only... | - | {"record_id": "record_hash_90fdcaf45a20a1370a8... | {"n_requests": 2, "n_successful_requests": 2, ... | {"start_time": "2024-04-12T16:49:41.918192", "... | 2024-04-12T16:49:46.774932 | 0.8 | 1.00 | NaN | [{'args': {'prompt': 'What are some ways to av... | [{'args': {'prompt': 'What are some ways to av... | NaN | 4 | 2133 | 0.003211 |
3 | Direct Query Engine | {"tru_class_info": {"name": "TruLlama", "modul... | RetrieverQueryEngine(llama_index.core.query_en... | record_hash_a7820dbf7eb248d9a65812ec3826f3cc | "What is 'good procrastination' according to P... | "Good procrastination, according to Paul Graha... | - | {"record_id": "record_hash_a7820dbf7eb248d9a65... | {"n_requests": 2, "n_successful_requests": 2, ... | {"start_time": "2024-04-12T16:49:47.550736", "... | 2024-04-12T16:49:51.383566 | 1.0 | 1.00 | 1.000000 | [{'args': {'prompt': 'What is 'good procrastin... | [{'args': {'prompt': 'What is 'good procrastin... | [{'args': {'source': 'To the extent this means... | 3 | 2123 | 0.003178 |
4 | Direct Query Engine | {"tru_class_info": {"name": "TruLlama", "modul... | RetrieverQueryEngine(llama_index.core.query_en... | record_hash_5e1074c0f06f88674bafaf9402c14405 | "What are the 18 reasons startups fail accordi... | "There are not 18 specific reasons listed in t... | - | {"record_id": "record_hash_5e1074c0f06f88674ba... | {"n_requests": 2, "n_successful_requests": 2, ... | {"start_time": "2024-04-12T16:49:52.125718", "... | 2024-04-12T16:49:57.669949 | 0.8 | 0.10 | 0.500000 | [{'args': {'prompt': 'What are the 18 reasons ... | [{'args': {'prompt': 'What are the 18 reasons ... | [{'args': {'source': '\[[13](#f13n)\] If ... | 5 | 2038 | 0.003062 |
# Launches on http://localhost:8501/
# Where it asks for a tunnel password, paste the IP address printed below.
tru.run_dashboard()
Starting dashboard ... npx: installed 22 in 5.505s Go to this url and submit the ip given here. your url is: https://fresh-women-decide.loca.lt Submit this IP Address: 34.133.161.169
<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>
from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core.node_parser import get_leaf_nodes
from llama_index.core import StorageContext
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import load_index_from_storage
def build_automerging_index(
documents,
llm,
embed_model="local:BAAI/bge-small-en-v1.5",
save_dir="merging_index",
chunk_sizes=None,
):
chunk_sizes = chunk_sizes or [2048, 512, 128]
node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
nodes = node_parser.get_nodes_from_documents(documents)
leaf_nodes = get_leaf_nodes(nodes)
Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = node_parser
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)
if not os.path.exists(save_dir):
automerging_index = VectorStoreIndex(
leaf_nodes, storage_context=storage_context)
automerging_index.storage_context.persist(persist_dir=save_dir)
else:
automerging_index = load_index_from_storage(
StorageContext.from_defaults(persist_dir=save_dir)
)
return automerging_index
automerging_index = build_automerging_index(
documents,
llm=Settings.llm,
embed_model=Settings.embed_model,
save_dir="merging_index"
)
def get_automerging_query_engine(
automerging_index,
similarity_top_k=12,
rerank_top_n=2,
):
base_retriever = automerging_index.as_retriever(similarity_top_k=similarity_top_k)
retriever = AutoMergingRetriever(
base_retriever, automerging_index.storage_context, verbose=True
)
rerank = SentenceTransformerRerank(
top_n=rerank_top_n, model="BAAI/bge-reranker-base"
)
auto_merging_engine = RetrieverQueryEngine.from_args(
retriever, node_postprocessors=[rerank]
)
return auto_merging_engine
automerging_query_engine = get_automerging_query_engine(
automerging_index,
)
auto_merging_response = automerging_query_engine.query(
"What's the difference between a maker's schedule and a manager's schedule?"
)
print(str(auto_merging_response))
> Merging 5 nodes into parent node. > Parent node id: 3a6d5440-29ce-4cbd-94e9-fd475eac7acc. > Parent node text: # Maker's Schedule, Manager's Schedule [](index.html) "...the mere consciousness o... > Merging 4 nodes into parent node. > Parent node id: 519e24b1-3f72-4eaf-bc30-7d13be319a5b. > Parent node text: I know this may sound oversensitive, but if you're a maker, think of your own case. Don't your sp... > Merging 3 nodes into parent node. > Parent node id: d0f27003-7ed9-4d88-bfcb-e302849970e5. > Parent node text: I used to program from dinner till about 3 am every day, because at night no one could interrupt ... > Merging 3 nodes into parent node. > Parent node id: 371f7d2b-77ba-4736-810e-75cc7f59c431. > Parent node text: # Maker's Schedule, Manager's Schedule [](index.html) "...the mere consciousness o... The difference between a maker's schedule and a manager's schedule lies in how time is utilized. The manager's schedule is divided into one-hour intervals, allowing for frequent task changes and meetings. On the other hand, the maker's schedule prefers larger units of time, like half a day, as it is more conducive for tasks that require deep focus and concentration, such as programming or writing. Meetings disrupt the maker's schedule significantly, breaking up the day into smaller, less productive segments, while they are more manageable on the manager's schedule.
tru.reset_database()
tru_recorder_automerging = get_prebuilt_trulens_recorder(automerging_query_engine,
app_id="Automerging Query Engine")
for question in eval_questions:
with tru_recorder_automerging as recording:
response = automerging_query_engine.query(question)
print(question)
print(response)
What's a good way for someone with kids to fund a startup? A good way for someone with kids to fund a startup is to get a consulting project job where they can build the software they intend to sell as a startup. By gradually transitioning from a consulting company to a product company and having clients pay for development expenses, this approach can help reduce the risk associated with starting a startup.
> Merging 1 nodes into parent node. > Parent node id: a111bfa6-3962-467e-ae64-b55a3da7a209. > Parent node text: Each thinks "I can't let my friends down." This is one of the most powerful forces in human natur... > Merging 1 nodes into parent node. > Parent node id: 7a9415dc-c4b2-48f4-b909-8c31c5dc8abb. > Parent node text: Observation confirms this too: cities either have a startup scene, or they don't. There is no mid... > Merging 1 nodes into parent node. > Parent node id: 36b43357-c167-446f-92fb-054be972ccab. > Parent node text: The second is that thanks to things like Kickstarter, a startup can get to revenue faster. You ca...
What are the ten reasons why the US has such a high concentration of startups? Startups are easier to start in America because funding is easier to get. There are now a few VC firms outside the US, but startup funding doesn't only come from VC firms. A more important source, because it's more personal and comes earlier in the process, is money from individual angel investors.
What are some ways to avoid copying the wrong thing? Copy only what you genuinely like.
> Merging 3 nodes into parent node. > Parent node id: fddcda8e-0847-43f9-9065-2abe6d259dc4. > Parent node text: # Good and Bad Procrastination [](index.html) December 2005 The most impressive... What is 'good procrastination' according to Paul Graham? Good procrastination, according to Paul Graham, is when one avoids errands to focus on doing real work. It involves working on something more important instead of less important tasks or nothing at all.
> Merging 1 nodes into parent node. > Parent node id: a111bfa6-3962-467e-ae64-b55a3da7a209. > Parent node text: Each thinks "I can't let my friends down." This is one of the most powerful forces in human natur... What are the 18 reasons startups fail according to Paul Graham? Paul Graham does not specifically mention 18 reasons why startups fail in the provided context.
What are the six principles for making new things? The six principles for making new things are not explicitly mentioned in the provided context.
> Merging 5 nodes into parent node. > Parent node id: 2de2c2c4-bbc5-4614-88a2-742ace7d9e14. > Parent node text: # Ramen Profitable [](index.html) **Want to start a startup?** Get funded by [Y Com... What is ramen profitable? Ramen profitable refers to a startup that generates enough revenue to cover the living expenses of its founders. This form of profitability allows the founders to sustain themselves without the need for additional funding, providing them with time and independence from investors.
Who are Paul Graham's top founders? Larry and Sergey
> Merging 3 nodes into parent node. > Parent node id: cac70d38-0d12-4cc1-b584-3537fe8dd9d1. > Parent node text: * * * # Subject: Airbnb [](index.html) March 2011 Yesterday Fred Wilson publ... Which VC investor famously passed on AirBnB after a mutual introduction by Paul Graham? Fred Wilson
What are the things should founders do that don't scale? Founders should focus on individual customers and go out of their way to make existing users super happy, even if it may not scale initially.
tru.get_leaderboard(app_ids=[])
Groundedness | Context Relevance | Answer Relevance | latency | total_cost | |
---|---|---|---|---|---|
app_id | |||||
Automerging Query Engine | 0.655 | 0.545 | 0.73 | 21.9 | 0.00067 |
# If you need to find the loca.lt tunnel URL, first stop and then re-run the tru dashboard
tru.stop_dashboard()
# launches on http://localhost:8501/
# Copy and paste the IP address below in the "Tunnel Password" field at the URL below.
tru.run_dashboard()
Starting dashboard ... Config file already exists. Skipping writing process. Credentials file already exists. Skipping writing process. npx: installed 22 in 6.117s Go to this url and submit the ip given here. your url is: https://sour-animals-wait.loca.lt Submit this IP Address: 34.133.161.169
<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>
Below is a screenshot of the TruLens dashboard comparing:
Technique | Total Cost ($) | Total Tokens | Context Relevance | Answer Relevance | Groundedness |
---|---|---|---|---|---|
Automerging Query Engine | 0.01 | 4.56k | 0.58 | 0.73 | 0.6 |
Direct Query Engine | 0.03 | 21.15k | 0.67 | 0.66 | 0.55 |
Sentence Window Retrieval | 0.03 | 20.48k | 0.63 | 0.81 | 0.76 |
----SCREENSHOT----