更新时间:2024-10-16 GMT+08:00
分享

文档摘要

基于已有的知识库进行摘要总结,支持Stuff、Refine、Map-Reduce三种策略。

  • Stuff:将所有文档直接填充到prompt中,一次性提交给模型处理,适合文档较少的场景。
    from pangukitsappdev.api.embeddings.factory import Embeddings
    from pangukitsappdev.api.llms.factory import LLMs
    from pangukitsappdev.api.memory.vector.factory import Vectors
    from pangukitsappdev.api.memory.vector.vector_config import VectorStoreConfig, ServerInfoCss
    from pangukitsappdev.skill.doc.summary import DocSummaryStuffSkill
    # Describe the "css" vector store that holds the existing knowledge base.
    # NOTE(review): env_prefix presumably selects the configuration keys used
    # to reach the CSS server -- confirm against the SDK configuration docs.
    css_embedding = Embeddings.of("css")
    css_server = ServerInfoCss(env_prefix="sdk.memory.css")
    store_settings = VectorStoreConfig(
        store_name="css",
        index_name="your_index_name",
        embedding=css_embedding,
        text_key="name",
        vector_fields=["description"],
        distance_strategy="inner_product",
        server_info=css_server,
    )
    knowledge_base = Vectors.of("css", store_settings)

    # Retrieve: look up documents similar to the query text.
    # NOTE(review): 4 is presumably the number of documents to return.
    search_text = "杜甫"
    matched_docs = knowledge_base.similarity_search(search_text, 4)

    # Summarize: the Stuff strategy places all retrieved documents into one
    # prompt and hands it to the model in a single request.
    summarizer = DocSummaryStuffSkill(LLMs.of("pangu"))
    summary = summarizer.execute({"documents": matched_docs})

    print(summary)
  • Refine:先基于首个文档生成初始答案,再循环处理后续文档,迭代更新答案。
    from pangukitsappdev.api.embeddings.factory import Embeddings
    from pangukitsappdev.api.llms.factory import LLMs
    from pangukitsappdev.api.memory.vector.factory import Vectors
    from pangukitsappdev.api.memory.vector.vector_config import VectorStoreConfig, ServerInfoCss
    from pangukitsappdev.skill.doc.summary import DocSummaryRefineSkill
    # Describe the "css" vector store that holds the existing knowledge base.
    # NOTE(review): env_prefix presumably selects the configuration keys used
    # to reach the CSS server -- confirm against the SDK configuration docs.
    css_embedding = Embeddings.of("css")
    css_server = ServerInfoCss(env_prefix="sdk.memory.css")
    store_settings = VectorStoreConfig(
        store_name="css",
        index_name="your_index_name",
        embedding=css_embedding,
        text_key="name",
        vector_fields=["description"],
        distance_strategy="inner_product",
        server_info=css_server,
    )

    knowledge_base = Vectors.of("css", store_settings)

    # Retrieve: look up documents similar to the query text.
    # NOTE(review): 4 is presumably the number of documents to return.
    search_text = "杜甫"
    matched_docs = knowledge_base.similarity_search(search_text, 4)

    # Summarize: the Refine strategy starts from the first document and then
    # loops over the remaining ones, iteratively updating the answer.
    summarizer = DocSummaryRefineSkill(LLMs.of("pangu"))
    summary = summarizer.execute({"documents": matched_docs})

    print(summary)
  • Map-Reduce:先对每个文档单独进行摘要,再将各文档的摘要结果提交给模型汇总;必要时会循环迭代摘要。
    from pangukitsappdev.api.embeddings.factory import Embeddings
    from pangukitsappdev.api.llms.factory import LLMs
    from pangukitsappdev.api.memory.vector.factory import Vectors
    from pangukitsappdev.api.memory.vector.vector_config import VectorStoreConfig, ServerInfoCss
    from pangukitsappdev.skill.doc.summary import DocSummaryMapReduceSkill
    # Describe the "css" vector store that holds the existing knowledge base.
    # NOTE(review): env_prefix presumably selects the configuration keys used
    # to reach the CSS server -- confirm against the SDK configuration docs.
    css_embedding = Embeddings.of("css")
    css_server = ServerInfoCss(env_prefix="sdk.memory.css")
    store_settings = VectorStoreConfig(
        store_name="css",
        index_name="your_index_name",
        embedding=css_embedding,
        text_key="name",
        vector_fields=["description"],
        distance_strategy="inner_product",
        server_info=css_server,
    )

    knowledge_base = Vectors.of("css", store_settings)

    # Retrieve: look up documents similar to the query text.
    # NOTE(review): 4 is presumably the number of documents to return.
    search_text = "杜甫"
    matched_docs = knowledge_base.similarity_search(search_text, 4)

    # Summarize: the Map-Reduce strategy summarizes each document on its own,
    # then submits those summaries to the model; it may iterate when needed.
    summarizer = DocSummaryMapReduceSkill(LLMs.of("pangu"))
    summary = summarizer.execute({"documents": matched_docs})

    print(summary)

相关文档