Exploring the Principles of GraphRAG with llamaindex

This article uses llamaindex's KnowledgeGraphIndex to build a knowledge graph from unstructured documents and then uses that graph to answer questions.

Environment Setup

Set up the inference (LLM) model and the embedding model.

from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
base_url='http://192.168.3.165:11434'
Settings.llm = Ollama(model="qwen2.5:latest", request_timeout=360.0,base_url=base_url)
Settings.embed_model = OllamaEmbedding(model_name="quentinz/bge-large-zh-v1.5:latest",base_url=base_url)
Settings.chunk_size = 512

# Helper to visualize the knowledge graph
def show_knowledge_graph(index, name='example.html'):
    from pyvis.network import Network

    g = index.get_networkx_graph()
    net = Network(notebook=True, cdn_resources="in_line", directed=True)
    net.from_nx(g)
    net.show(name)

Building the Knowledge Graph

First, prepare the unstructured documents. In a real project you would load them with SimpleDirectoryReader (as in the later example); here we simply define a few Document objects by hand.

# 1. Define the documents by hand
from llama_index.core.schema import TextNode, Document
documents = [
    Document(id='123', text='阴雨绵绵的清晨,李华见徐天行老师在校门口焦急张望,立刻推出自己的新自行车:“老师,我送您去教室!”放学后,伍娟摸着儿子湿透的校服眼眶发红,李山拍拍儿子肩膀:“懂得承担责任,是男子汉了。'),
    Document(id='456', text='徐天行举着李华的作业本:“横平竖直,一夜进步神速!”原来李华熬夜练习书法,只因徐老师曾说“字如心正”。'),
    Document(id='789', text='张志抱着一摞昆虫图鉴冲进教室:“生物课展示靠你了!”李华苦笑——他最怕甲虫,但想起徐老师“直面恐惧”的鼓励,咬牙点头。两人熬夜制作模型,李山帮忙3D打印鞘翅,伍娟端来牛奶。次日展示时,徐老师按下录音键:“这份协作,值得全班倾听。” 。')
]

Next, use KnowledgeGraphIndex to build the knowledge graph (i.e., the index) over the corpus as a whole.

from llama_index.core import SimpleDirectoryReader, StorageContext, KnowledgeGraphIndex
from llama_index.core.graph_stores import SimpleGraphStore

import os
save_kg_path = r'db\example\graph.db'
if os.path.exists(save_kg_path):
    # Load the previously persisted graph store and build the index on top of it
    graph_store = SimpleGraphStore.from_persist_dir(persist_dir=r'db\example', graph_store_fname='graph.db')

    index = KnowledgeGraphIndex.from_documents(
        documents,
        max_triplets_per_chunk=50,
        graph_store=graph_store,
        show_progress=True
    )
else:
    graph_store = SimpleGraphStore()
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    index = KnowledgeGraphIndex.from_documents(
        documents,
        max_triplets_per_chunk=50,
        storage_context=storage_context,
        show_progress=True
    )
    index.storage_context.persist(persist_dir=r'db\example', graph_store_fname='graph.db')
Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]
Processing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

Finally, inspect the contents of the knowledge graph with the pyvis tool.

show_knowledge_graph(index,name='example.html')

[Figure: pyvis visualization of the LLM-extracted knowledge graph (example.html)]

The visualization shows that the LLM-built knowledge graph turned the Chinese names into English, and the overall quality of the graph looks rather poor.
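
Beyond the picture, you can also dump the extracted triplets as plain text to judge their quality. A minimal sketch, reusing the get_networkx_graph() call from show_knowledge_graph above; the name of the edge attribute that stores the relation label may vary across llama_index versions, so the sketch prints the whole attribute dict instead of assuming a key:

# Minimal sketch: list the triplets behind the visualization.
# Each edge of the networkx graph corresponds to one (subject, relation, object) triplet;
# print the full attribute dict because the key holding the relation label may differ by version.
g = index.get_networkx_graph()
for subj, obj, attrs in g.edges(data=True):
    print(subj, attrs, obj)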

In essence, a knowledge graph is just a collection of N triplets of the form (subject, relation, object), so you can also add triplets to the index object directly and build the graph "by hand".

# 2. Split the documents into chunks
from llama_index.core.node_parser import SentenceSplitter
node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=32)
nodes = node_parser.get_nodes_from_documents(documents)

# 3. Build an empty index and add triplets by hand
from llama_index.core import KnowledgeGraphIndex
index = KnowledgeGraphIndex(
    [],
)

# Relations for the 1st document
node_tups = [('李华', '母亲是', '伍娟'), ('李华', '父亲是', '李山')]
for node_tup in node_tups:
    index.upsert_triplet_and_node(node_tup, nodes[0], include_embeddings=False)

# Relations for the 2nd document
node_tups = [('李华', '同学是', '张志')]
for node_tup in node_tups:
    index.upsert_triplet_and_node(node_tup, nodes[1], include_embeddings=False)

# Relations for the 3rd document
node_tups = [('张志', '老师是', '徐天行')]
for node_tup in node_tups:
    index.upsert_triplet_and_node(node_tup, nodes[2], include_embeddings=False)

show_knowledge_graph(index, name='custom_example.html')

[Figure: pyvis visualization of the hand-built knowledge graph (custom_example.html)]

Using the Knowledge Graph

# Use the index directly to extract triplets from a piece of text
index._extract_triplets("李华和王五去徐天行老师家补课")
[('李华', '去', '徐天行老师家'), ('王五', '去', '徐天行老师家'), ('补课', '发生在', '徐天行老师家')]
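
_extract_triplets is the same (private) helper that from_documents applies to every chunk, so pairing it with upsert_triplet_and_node essentially replays the automatic build step by hand. A rough sketch under that assumption, using a throwaway index so the hand-built index above stays untouched; since the method is private, its behaviour may change between versions:

# Rough sketch: manually replay what from_documents does for a single chunk.
# _extract_triplets is a private API, so treat this as illustrative only.
from llama_index.core import KnowledgeGraphIndex
from llama_index.core.schema import TextNode

demo_index = KnowledgeGraphIndex([])                  # separate index, keeps the demo above intact
text = "李华和王五去徐天行老师家补课"
node = TextNode(text=text)
for triplet in demo_index._extract_triplets(text):    # e.g. ('李华', '去', '徐天行老师家')
    demo_index.upsert_triplet_and_node(triplet, node, include_embeddings=False)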

# Index -> retriever
from llama_index.core.indices.knowledge_graph.retrievers import KGRetrieverMode
retriever=index.as_retriever(retriever_mode=KGRetrieverMode.KEYWORD)

from llama_index.core.schema import QueryBundle
query=QueryBundle(query_str="李华和同学张志一起完成了什么任务?")
retriever._retrieve(query)
[NodeWithScore(node=TextNode(id_='d38689d8-a7c2-41f8-a8de-d79c84bc4034', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='97ee7e69-7ebf-4343-a587-8ee10e281cae', node_type='4', metadata={}, hash='96f4e9c8f81325976a25213572d9255024347d55bce55e980bc6d1ec03bd33f7')}, metadata_template='{key}: {value}', metadata_separator='\n', text='徐天行举着李华的作业本:“横平竖直,一夜进步神速!”原来李华熬夜练习书法,只因徐老师曾说“字如心正”。', mimetype='text/plain', start_char_idx=0, end_char_idx=51, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=1000.0),
NodeWithScore(node=TextNode(id_='3a354c0b-3051-41ae-80f0-089627928e8c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='a3aa4ae9-40e0-4303-b800-4a88d4f90b54', node_type='4', metadata={}, hash='44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a')}, metadata_template='{key}: {value}', metadata_separator='\n', text='', mimetype='text/plain', start_char_idx=0, end_char_idx=0, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=1000.0),
NodeWithScore(node=TextNode(id_='132f96dc-4d68-4313-ad64-d3a19f33581b', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='822127f3-1511-4a31-921b-c527fd0f3a15', node_type='4', metadata={}, hash='904fbb3a3e01cf71a67cd33a5dafe1966fade6f3c538516a59823b2e78041b62')}, metadata_template='{key}: {value}', metadata_separator='\n', text='阴雨绵绵的清晨,李华见徐天行老师在校门口焦急张望,立刻推出自己的新自行车:“老师,我送您去教室!”放学后,伍娟摸着儿子湿透的校服眼眶发红,李山拍拍儿子肩膀:“懂得承担责任,是男子汉了。', mimetype='text/plain', start_char_idx=0, end_char_idx=92, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=1000.0),
NodeWithScore(node=TextNode(id_='64566ef0-e951-4512-86d3-b3c5dcf17f8a', embedding=None, metadata={'kg_rel_texts': ["['张志', '老师是', '徐天行']", "['李华', '母亲是', '伍娟']", "['李华', '父亲是', '李山']", "['李华', '同学是', '张志']", "['张志', '老师是', '徐天行']"], 'kg_rel_map': {'任务': [], '完成': [], '同学': [], '张志': [['张志', '老师是', '徐天行']], '李华': [['李华', '母亲是', '伍娟'], ['李华', '父亲是', '李山'], ['李华', '同学是', '张志'], ['张志', '老师是', '徐天行']]}}, excluded_embed_metadata_keys=['kg_rel_map', 'kg_rel_texts'], excluded_llm_metadata_keys=['kg_rel_map', 'kg_rel_texts'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text="The following are knowledge sequence in max depth 2 in the form of directed graph like:\n`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`\n['张志', '老师是', '徐天行']\n['李华', '母亲是', '伍娟']\n['李华', '父亲是', '李山']\n['李华', '同学是', '张志']\n['张志', '老师是', '徐天行']", mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=1000.0)]

# Index -> RAG query engine
query_engine = index.as_query_engine(
    include_text=True, response_mode="tree_summarize", similarity_top_k=3
)
response = query_engine.query(
    "老师如何评价李华和张志的课程作业",
)

from IPython.display import Markdown
display(Markdown(f"<b>{response}</b>"))

徐天行对李华的书法进步给予了积极评价,说他的字 “横平竖直,一夜进步神速”。至于张志的情况,文中并未提及具体课程作业的评价信息。

In the last two examples the knowledge graph failed to retrieve the right document and to answer the question accurately, because retrieval only matched entities (keywords) against the graph. Below we add the documents' overall semantic information (embeddings) on top of the graph, so that retrieval considers both entities and semantics, and see how retrieval and answering behave.

index = KnowledgeGraphIndex(
    [],
)

# Relations for the 1st document
node_tups = [('李华', '母亲是', '伍娟'), ('李华', '父亲是', '李山')]
for node_tup in node_tups:
    index.upsert_triplet_and_node(node_tup, nodes[0], include_embeddings=True)

# Relations for the 2nd document
node_tups = [('李华', '同学是', '张志')]
for node_tup in node_tups:
    index.upsert_triplet_and_node(node_tup, nodes[1], include_embeddings=True)

# Relations for the 3rd document
node_tups = [('张志', '老师是', '徐天行')]
for node_tup in node_tups:
    index.upsert_triplet_and_node(node_tup, nodes[2], include_embeddings=True)

# Index -> retriever (hybrid: keywords + embeddings)
from llama_index.core.indices.knowledge_graph.retrievers import KGRetrieverMode

retriever = index.as_retriever(retriever_mode=KGRetrieverMode.HYBRID)

from llama_index.core.schema import QueryBundle
query = QueryBundle(query_str="老师如何评价李华和张志的课程作业")
retriever._retrieve(query)
[NodeWithScore(node=TextNode(id_='d38689d8-a7c2-41f8-a8de-d79c84bc4034', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='97ee7e69-7ebf-4343-a587-8ee10e281cae', node_type='4', metadata={}, hash='96f4e9c8f81325976a25213572d9255024347d55bce55e980bc6d1ec03bd33f7')}, metadata_template='{key}: {value}', metadata_separator='\n', text='徐天行举着李华的作业本:“横平竖直,一夜进步神速!”原来李华熬夜练习书法,只因徐老师曾说“字如心正”。', mimetype='text/plain', start_char_idx=0, end_char_idx=51, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=1000.0),
NodeWithScore(node=TextNode(id_='3a354c0b-3051-41ae-80f0-089627928e8c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='a3aa4ae9-40e0-4303-b800-4a88d4f90b54', node_type='4', metadata={}, hash='44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a')}, metadata_template='{key}: {value}', metadata_separator='\n', text='', mimetype='text/plain', start_char_idx=0, end_char_idx=0, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=1000.0),
NodeWithScore(node=TextNode(id_='132f96dc-4d68-4313-ad64-d3a19f33581b', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='822127f3-1511-4a31-921b-c527fd0f3a15', node_type='4', metadata={}, hash='904fbb3a3e01cf71a67cd33a5dafe1966fade6f3c538516a59823b2e78041b62')}, metadata_template='{key}: {value}', metadata_separator='\n', text='阴雨绵绵的清晨,李华见徐天行老师在校门口焦急张望,立刻推出自己的新自行车:“老师,我送您去教室!”放学后,伍娟摸着儿子湿透的校服眼眶发红,李山拍拍儿子肩膀:“懂得承担责任,是男子汉了。', mimetype='text/plain', start_char_idx=0, end_char_idx=92, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=1000.0),
NodeWithScore(node=TextNode(id_='f730db7b-3fe4-4133-b749-2872dee27412', embedding=None, metadata={'kg_rel_texts': ["('张志', '老师是', '徐天行')", "['张志', '老师是', '徐天行']", "('李华', '同学是', '张志')", "['李华', '母亲是', '伍娟']", "['李华', '同学是', '张志']", "['李华', '父亲是', '李山']"], 'kg_rel_map': {'课程作业': [], '老师': [], '李华': [['李华', '母亲是', '伍娟'], ['李华', '父亲是', '李山'], ['李华', '同学是', '张志'], ['张志', '老师是', '徐天行']], '张志': [['张志', '老师是', '徐天行']], '评价': []}}, excluded_embed_metadata_keys=['kg_rel_map', 'kg_rel_texts'], excluded_llm_metadata_keys=['kg_rel_map', 'kg_rel_texts'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text="The following are knowledge sequence in max depth 2 in the form of directed graph like:\n`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`\n('张志', '老师是', '徐天行')\n['张志', '老师是', '徐天行']\n('李华', '同学是', '张志')\n['李华', '母亲是', '伍娟']\n['李华', '同学是', '张志']\n['李华', '父亲是', '李山']", mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=1000.0)]

# Index -> RAG query engine
query_engine = index.as_query_engine(
    include_text=True, response_mode="refine"
)
response = query_engine.query(
    "老师如何评价李华和张志的生物课程作业",
)

from IPython.display import Markdown
display(Markdown(f"<b>{response}</b>"))

根据新提供的信息,我们仍然没有关于老师对李华和张志生物课程作业的具体评价。文中仅描述了徐天行是张志的老师以及与李华相关的一些家庭成员关系,并未提及任何有关生物课程作业的内容。因此,无法回答关于他们的生物课程作业情况。

As you can see, neither retrieval nor the answer improved. GraphRAG is clearly not effective in every scenario, or rather its effectiveness depends on the quality of the knowledge graph; in this script the graph is simply too small to be of much use.

How the Knowledge Graph Is Built

The heart of graph construction is using the LLM to extract triplets from the text, so let's first look at the prompt it uses.

from llama_index.core import KnowledgeGraphIndex

index=KnowledgeGraphIndex([])
index.kg_triplet_extract_template

PromptTemplate(metadata={'prompt_type': <PromptType.KNOWLEDGE_TRIPLET_EXTRACT: 'knowledge_triplet_extract'>}, template_vars=['max_knowledge_triplets', 'text'], kwargs={'max_knowledge_triplets': 10}, output_parser=None, template_var_mappings=None, function_mappings=None, template="Some text is provided below. Given the text, extract up to {max_knowledge_triplets} knowledge triplets in the form of (subject, predicate, object). Avoid stopwords.\n---------------------\nExample:Text: Alice is Bob's mother.Triplets:\n(Alice, is mother of, Bob)\nText: Philz is a coffee shop founded in Berkeley in 1982.\nTriplets:\n(Philz, is, coffee shop)\n(Philz, founded in, Berkeley)\n(Philz, founded in, 1982)\n---------------------\nText: {text}\nTriplets:\n")

The prompt content is as follows:

Some text is provided below. Given the text, extract up to {max_knowledge_triplets} knowledge triplets in the form of (subject, predicate, object). Avoid stopwords.\n
---------------------\n
Example:Text: Alice is Bob's mother.Triplets:\n(Alice, is mother of, Bob)\n
Text: Philz is a coffee shop founded in Berkeley in 1982.\n
Triplets:\n(Philz, is, coffee shop)\n
(Philz, founded in, Berkeley)\n
(Philz, founded in, 1982)\n
---------------------\n
Text: {text}\n
Triplets:\n

As the prompt shows, the LLM is asked to return up to max_knowledge_triplets triplets of the form (subject, predicate, object), guided by a few worked examples.
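
Because this default template is written in English, it is not surprising that the extracted entity names came back in English earlier. One way to keep the extraction in Chinese is to pass your own template through the kg_triplet_extract_template constructor argument. A hedged sketch; the Chinese prompt wording below is my own illustration, not part of the library:

# Hedged sketch: swap in a Chinese-language extraction prompt.
# The template must keep the {max_knowledge_triplets} and {text} variables;
# the wording itself is illustrative, not the library default.
from llama_index.core import KnowledgeGraphIndex, PromptTemplate

zh_template = PromptTemplate(
    "下面给出一段文本。请从中抽取至多 {max_knowledge_triplets} 个"
    "(主语, 谓语, 宾语) 形式的知识三元组,实体名保持原文中的中文写法,避免停用词。\n"
    "---------------------\n"
    "文本: {text}\n"
    "三元组:\n"
)

index = KnowledgeGraphIndex.from_documents(
    documents,
    max_triplets_per_chunk=50,
    kg_triplet_extract_template=zh_template,
    show_progress=True,
)

Whether this actually keeps the Chinese entity names still depends on how well the chosen model follows the instruction.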

A Small Application Example

from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader("./三国演义白话文/",recursive=True).load_data(show_progress=True)
Loading files: 100%|██████████| 8/8 [00:00<00:00, 74.28file/s]

from llama_index.core import StorageContext, KnowledgeGraphIndex
from llama_index.core.graph_stores import SimpleGraphStore

import os
save_kg_path = r'db\sanguo\graph.db'
if os.path.exists(save_kg_path):
    # Load the previously persisted graph store and build the index on top of it
    graph_store = SimpleGraphStore.from_persist_dir(persist_dir=r'db\sanguo', graph_store_fname='graph.db')

    index = KnowledgeGraphIndex.from_documents(
        documents,
        max_triplets_per_chunk=50,
        graph_store=graph_store,
        show_progress=True
    )
else:
    graph_store = SimpleGraphStore()
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    index = KnowledgeGraphIndex.from_documents(
        documents,
        max_triplets_per_chunk=50,
        storage_context=storage_context,
        show_progress=True
    )
    index.storage_context.persist(persist_dir=r'db\sanguo', graph_store_fname='graph.db')
Parsing nodes:   0%|          | 0/8 [00:00<?, ?it/s]
Processing nodes:   0%|          | 0/80 [00:00<?, ?it/s]

query_engine = index.as_query_engine(
    include_text=False, response_mode="tree_summarize"
)
response = query_engine.query(
    "董卓如何将吕布收为义子的?",
)

from IPython.display import Markdown
display(Markdown(f"<b>{response}</b>"))

董卓通过给予礼物的方式收买了吕布,将他收为了义子。根据信息,董卓用一匹名为 “赤兔马” 的千里马和数不尽的黄金珠宝作为礼物送给了吕布,成功地让他拜自己为义父。

query_engine = index.as_query_engine(
    include_text=True, response_mode="tree_summarize"
)
response = query_engine.query(
    "刘备如何为朝廷效力?",
)

display(Markdown(f"<b>{response}</b>"))

根据提供的信息,刘备最初是通过军事才能为朝廷效力的。在故事中提到,关羽和张飞在战场上英勇无敌,而刘备则既有勇又有谋,三人接连取胜,战功赫赫。但是,随着时间推移,朝廷变得更加腐败,实行卖官鬻爵的行为。尽管如此,刘备并未放弃为国家贡献力量的机会,在担任安喜县县尉期间,他与百姓秋毫无犯,深得民心,展现了其忠于职守的一面。

Summary: this article used llamaindex to study how a knowledge graph is built and created a RAG application on top of it. We found that GraphRAG's answer quality is not guaranteed in every scenario; it depends on having a high-quality knowledge graph.