Browse Source

【新增】AI 知识库: 配置自定义、段落启禁用

xiaoxin 10 months ago
parent
commit
9b8136ef30

+ 7 - 0
yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeCreateMyReqVO.java

@@ -25,4 +25,11 @@ public class AiKnowledgeCreateMyReqVO {
     @NotNull(message = "嵌入模型不能为空")
     private Long modelId;
 
+    @Schema(description = "相似性阈值", requiredMode = Schema.RequiredMode.REQUIRED, example = "0.5")
+    @NotNull(message = "相似性阈值不能为空")
+    private Double similarityThreshold;
+
+    @Schema(description = "topK", requiredMode = Schema.RequiredMode.REQUIRED, example = "3")
+    @NotNull(message = "topK 不能为空")
+    private Integer topK;
 }

+ 19 - 0
yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeDocumentCreateReqVO.java

@@ -23,4 +23,23 @@ public class AiKnowledgeDocumentCreateReqVO {
     @URL(message = "文档 URL 格式不正确")
     private String url;
 
+    @Schema(description = "每个文本块的目标 token 数", requiredMode = Schema.RequiredMode.REQUIRED, example = "800")
+    @NotNull(message = "每个文本块的目标 token 数不能为空")
+    private Integer defaultChunkSize;
+
+    @Schema(description = "每个文本块的最小字符数", requiredMode = Schema.RequiredMode.REQUIRED, example = "350")
+    @NotNull(message = "每个文本块的最小字符数不能为空")
+    private Integer minChunkSizeChars;
+
+    @Schema(description = "丢弃阈值", requiredMode = Schema.RequiredMode.REQUIRED, example = "5")
+    @NotNull(message = "丢弃阈值不能为空")
+    private Integer minChunkLengthToEmbed;
+
+    @Schema(description = "最大块数", requiredMode = Schema.RequiredMode.REQUIRED, example = "10000")
+    @NotNull(message = "最大块数不能为空")
+    private Integer maxNumChunks;
+
+    @Schema(description = "分块是否保留分隔符", requiredMode = Schema.RequiredMode.REQUIRED, example = "true")
+    @NotNull(message = "分块是否保留分隔符不能为空")
+    private Boolean keepSeparator;
 }

+ 12 - 0
yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDO.java

@@ -52,6 +52,18 @@ public class AiKnowledgeDO extends BaseDO {
      * 模型标识
      */
     private String model;
+
+    /**
+     * topK
+     */
+    private Integer topK;
+
+    /**
+     * 相似度阈值
+     */
+    private Double similarityThreshold;
+
+
     /**
      * 状态
      * <p>

+ 21 - 1
yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDocumentDO.java

@@ -23,7 +23,7 @@ public class AiKnowledgeDocumentDO extends BaseDO {
     private Long id;
     /**
      * 知识库编号
-     *
+     * <p>
      * 关联 {@link AiKnowledgeDO#getId()}
      */
     private Long knowledgeId;
@@ -47,6 +47,26 @@ public class AiKnowledgeDocumentDO extends BaseDO {
      * 字符数
      */
     private Integer wordCount;
+    /**
+     * 每个文本块的目标 token 数
+     */
+    private Integer defaultChunkSize;
+    /**
+     * 每个文本块的最小字符数
+     */
+    private Integer minChunkSizeChars;
+    /**
+     * 低于此值的块会被丢弃
+     */
+    private Integer minChunkLengthToEmbed;
+    /**
+     * 最大块数
+     */
+    private Integer maxNumChunks;
+    /**
+     * 分块是否保留分隔符
+     */
+    private Boolean keepSeparator;
     /**
      * 切片状态
      * <p>

+ 6 - 3
yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeSegmentDO.java

@@ -2,6 +2,8 @@ package cn.iocoder.yudao.module.ai.dal.dataobject.knowledge;
 
 import cn.iocoder.yudao.framework.common.enums.CommonStatusEnum;
 import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
+import com.baomidou.mybatisplus.annotation.FieldStrategy;
+import com.baomidou.mybatisplus.annotation.TableField;
 import com.baomidou.mybatisplus.annotation.TableId;
 import com.baomidou.mybatisplus.annotation.TableName;
 import lombok.Data;
@@ -25,16 +27,17 @@ public class AiKnowledgeSegmentDO extends BaseDO {
     /**
      * 向量库的编号
      */
+    @TableField(updateStrategy = FieldStrategy.ALWAYS)
     private String vectorId;
     /**
      * 知识库编号
-     *
+     * <p>
      * 关联 {@link AiKnowledgeDO#getId()}
      */
     private Long knowledgeId;
     /**
      * 文档编号
-     *
+     * <p>
      * 关联 {@link AiKnowledgeDocumentDO#getId()}
      */
     private Long documentId;
@@ -52,7 +55,7 @@ public class AiKnowledgeSegmentDO extends BaseDO {
     private Integer tokens;
     /**
      * 状态
-     *
+     * <p>
      * 枚举 {@link CommonStatusEnum}
      */
     private Integer status;

+ 5 - 16
yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java

@@ -9,15 +9,11 @@ import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
 import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.document.AiKnowledgeDocumentPageReqVO;
 import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.document.AiKnowledgeDocumentUpdateReqVO;
 import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnowledgeDocumentCreateReqVO;
-import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDO;
 import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDocumentDO;
 import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeSegmentDO;
-import cn.iocoder.yudao.module.ai.dal.dataobject.model.AiChatModelDO;
 import cn.iocoder.yudao.module.ai.dal.mysql.knowledge.AiKnowledgeDocumentMapper;
 import cn.iocoder.yudao.module.ai.dal.mysql.knowledge.AiKnowledgeSegmentMapper;
 import cn.iocoder.yudao.module.ai.enums.knowledge.AiKnowledgeDocumentStatusEnum;
-import cn.iocoder.yudao.module.ai.service.model.AiApiKeyService;
-import cn.iocoder.yudao.module.ai.service.model.AiChatModelService;
 import jakarta.annotation.Resource;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.ai.document.Document;
@@ -48,24 +44,16 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
     @Resource
     private AiKnowledgeSegmentMapper segmentMapper;
 
-    @Resource
-    private TokenTextSplitter tokenTextSplitter;
     @Resource
     private TokenCountEstimator tokenCountEstimator;
-
-    @Resource
-    private AiApiKeyService apiKeyService;
     @Resource
     private AiKnowledgeService knowledgeService;
-    @Resource
-    private AiChatModelService chatModelService;
 
     @Override
     @Transactional(rollbackFor = Exception.class)
     public Long createKnowledgeDocument(AiKnowledgeDocumentCreateReqVO createReqVO) {
-        // 0. 校验
-        AiKnowledgeDO knowledge = knowledgeService.validateKnowledgeExists(createReqVO.getKnowledgeId());
-        AiChatModelDO model = chatModelService.validateChatModel(knowledge.getModelId());
+        // 0. 校验并获取向量存储实例
+        VectorStore vectorStore = knowledgeService.getVectorStoreById(createReqVO.getKnowledgeId());
 
         // 1.1 下载文档
         TikaDocumentReader loader = new TikaDocumentReader(downloadFile(createReqVO.getUrl()));
@@ -82,6 +70,9 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
             return documentId;
         }
 
+        // 2 构造文本分段器
+        TokenTextSplitter tokenTextSplitter = new TokenTextSplitter(createReqVO.getDefaultChunkSize(), createReqVO.getMinChunkSizeChars(), createReqVO.getMinChunkLengthToEmbed(),
+                createReqVO.getMaxNumChunks(), createReqVO.getKeepSeparator());
         // 2.1 文档分段
         List<Document> segments = tokenTextSplitter.apply(documents);
         // 2.2 分段内容入库
@@ -92,8 +83,6 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
                         .setStatus(CommonStatusEnum.ENABLE.getStatus()));
         segmentMapper.insertBatch(segmentDOList);
 
-        // 3.1 获取向量存储实例
-        VectorStore vectorStore = apiKeyService.getOrCreateVectorStore(model.getKeyId());
         // 3.2 向量化并存储
         segments.forEach(segment -> segment.getMetadata().put(AiKnowledgeSegmentDO.FIELD_KNOWLEDGE_ID, createReqVO.getKnowledgeId()));
         vectorStore.add(segments);

+ 57 - 7
yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java

@@ -2,6 +2,7 @@ package cn.iocoder.yudao.module.ai.service.knowledge;
 
 import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.collection.ListUtil;
+import cn.iocoder.yudao.framework.common.enums.CommonStatusEnum;
 import cn.iocoder.yudao.framework.common.pojo.PageResult;
 import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
 import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.segment.AiKnowledgeSegmentPageReqVO;
@@ -23,6 +24,10 @@ import org.springframework.ai.vectorstore.filter.FilterExpressionBuilder;
 import org.springframework.stereotype.Service;
 
 import java.util.List;
+import java.util.Objects;
+
+import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
+import static cn.iocoder.yudao.module.ai.enums.ErrorCodeConstants.KNOWLEDGE_SEGMENT_NOT_EXISTS;
 
 /**
  * AI 知识库分片 Service 实现类
@@ -50,14 +55,45 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
 
     @Override
     public void updateKnowledgeSegment(AiKnowledgeSegmentUpdateReqVO reqVO) {
-        segmentMapper.updateById(BeanUtils.toBean(reqVO, AiKnowledgeSegmentDO.class));
-        // TODO @xin 重新向量化
+        // Updates a segment's content and keeps the vector store in sync with it.
+        // 0. Validate that the segment exists
+        AiKnowledgeSegmentDO oldKnowledgeSegment = validateKnowledgeSegmentExists(reqVO.getId());
+
+        // 1.1 Resolve the vector store of the knowledge base the segment belongs to
+        VectorStore vectorStore = knowledgeService.getVectorStoreById(oldKnowledgeSegment.getKnowledgeId());
+        // 1.2 Delete the stale vector. A disabled segment is stored with a null vectorId,
+        //     and List.of(null) throws NullPointerException, so the delete must be guarded.
+        if (oldKnowledgeSegment.getVectorId() != null) {
+            vectorStore.delete(List.of(oldKnowledgeSegment.getVectorId()));
+        }
+
+        // 2. Build the row update from the request
+        AiKnowledgeSegmentDO knowledgeSegment = BeanUtils.toBean(reqVO, AiKnowledgeSegmentDO.class);
+        if (Objects.equals(oldKnowledgeSegment.getStatus(), CommonStatusEnum.ENABLE.getStatus())) {
+            // 2.1 Enabled segment: embed the new content and record the new vector id
+            Document document = new Document(reqVO.getContent());
+            document.getMetadata().put(AiKnowledgeSegmentDO.FIELD_KNOWLEDGE_ID, oldKnowledgeSegment.getKnowledgeId());
+            vectorStore.add(List.of(document));
+            knowledgeSegment.setVectorId(document.getId());
+        } else {
+            // 2.2 Disabled segment: it must stay out of the vector store, so no vector id is kept.
+            //     It is embedded from its stored content when it is enabled again.
+            knowledgeSegment.setVectorId(null);
+        }
+
+        // 3. Persist the new content (vectorId is always written, see its updateStrategy)
+        segmentMapper.updateById(knowledgeSegment);
     }
 
     @Override
     public void updateKnowledgeSegmentStatus(AiKnowledgeSegmentUpdateStatusReqVO reqVO) {
-        segmentMapper.updateById(BeanUtils.toBean(reqVO, AiKnowledgeSegmentDO.class));
-        // TODO @xin 1.禁用删除向量 2.启用重新向量化
+        // Enables or disables a segment: an enabled segment has a vector in the store,
+        // a disabled one has none and a null vectorId.
+        // 0. Validate that the segment exists
+        AiKnowledgeSegmentDO oldKnowledgeSegment = validateKnowledgeSegmentExists(reqVO.getId());
+        // 1. Resolve the vector store of the knowledge base the segment belongs to
+        VectorStore vectorStore = knowledgeService.getVectorStoreById(oldKnowledgeSegment.getKnowledgeId());
+        AiKnowledgeSegmentDO knowledgeSegment = BeanUtils.toBean(reqVO, AiKnowledgeSegmentDO.class);
+
+        // 2.1 Drop the existing vector, if there is one. Doing this for both target states
+        //     keeps repeated calls safe: re-enabling does not leave an orphaned duplicate,
+        //     and re-disabling does not call List.of(null), which throws NullPointerException.
+        if (oldKnowledgeSegment.getVectorId() != null) {
+            vectorStore.delete(List.of(oldKnowledgeSegment.getVectorId()));
+        }
+        if (Objects.equals(reqVO.getStatus(), CommonStatusEnum.ENABLE.getStatus())) {
+            // 2.2 Enabling: embed the stored content again and record the new vector id
+            Document document = new Document(oldKnowledgeSegment.getContent());
+            document.getMetadata().put(AiKnowledgeSegmentDO.FIELD_KNOWLEDGE_ID, oldKnowledgeSegment.getKnowledgeId());
+            vectorStore.add(List.of(document));
+            knowledgeSegment.setVectorId(document.getId());
+        } else {
+            // 2.3 Disabling: the vector is gone, so clear the reference to it
+            knowledgeSegment.setVectorId(null);
+        }
+        // 3. Persist the new status together with the vector id
+        segmentMapper.updateById(knowledgeSegment);
     }
 
     @Override
@@ -71,9 +107,8 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
 
         // 1.2 向量检索
         List<Document> documentList = vectorStore.similaritySearch(SearchRequest.query(reqVO.getContent())
-                //TODO  @xin 配置提取
-                .withTopK(5)
-                .withSimilarityThreshold(0.5d)
+                .withTopK(knowledge.getTopK())
+                .withSimilarityThreshold(knowledge.getSimilarityThreshold())
                 .withFilterExpression(new FilterExpressionBuilder().eq(AiKnowledgeSegmentDO.FIELD_KNOWLEDGE_ID, reqVO.getKnowledgeId()).build()));
         if (CollUtil.isEmpty(documentList)) {
             return ListUtil.empty();
@@ -81,4 +116,19 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
         // 2.1 段落召回
         return segmentMapper.selectList(CollUtil.getFieldValues(documentList, "id", String.class));
     }
+
+
+    /**
+     * Loads a knowledge segment by id and fails when no such row exists.
+     *
+     * @param id segment id
+     * @return the stored segment
+     */
+    private AiKnowledgeSegmentDO validateKnowledgeSegmentExists(Long id) {
+        AiKnowledgeSegmentDO segment = segmentMapper.selectById(id);
+        if (segment != null) {
+            return segment;
+        }
+        throw exception(KNOWLEDGE_SEGMENT_NOT_EXISTS);
+    }
 }

+ 9 - 0
yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeService.java

@@ -5,6 +5,7 @@ import cn.iocoder.yudao.framework.common.pojo.PageResult;
 import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnowledgeCreateMyReqVO;
 import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnowledgeUpdateMyReqVO;
 import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDO;
+import org.springframework.ai.vectorstore.VectorStore;
 
 /**
  * AI 知识库-基础信息 Service 接口
@@ -47,4 +48,12 @@ public interface AiKnowledgeService {
      * @return 知识库分页
      */
     PageResult<AiKnowledgeDO> getKnowledgePageMy(Long userId, PageParam pageReqVO);
+
+    /**
+     * Gets the vector store instance for a knowledge base.
+     *
+     * @param knowledgeId knowledge base id
+     * @return the vector store instance
+     */
+    VectorStore getVectorStoreById(Long knowledgeId);
 }

+ 13 - 0
yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeServiceImpl.java

@@ -10,9 +10,11 @@ import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnow
 import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDO;
 import cn.iocoder.yudao.module.ai.dal.dataobject.model.AiChatModelDO;
 import cn.iocoder.yudao.module.ai.dal.mysql.knowledge.AiKnowledgeMapper;
+import cn.iocoder.yudao.module.ai.service.model.AiApiKeyService;
 import cn.iocoder.yudao.module.ai.service.model.AiChatModelService;
 import jakarta.annotation.Resource;
 import lombok.extern.slf4j.Slf4j;
+import org.springframework.ai.vectorstore.VectorStore;
 import org.springframework.stereotype.Service;
 
 import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
@@ -32,6 +34,10 @@ public class AiKnowledgeServiceImpl implements AiKnowledgeService {
 
     @Resource
     private AiKnowledgeMapper knowledgeMapper;
+    @Resource
+    private AiChatModelService chatModelService;
+    @Resource
+    private AiApiKeyService apiKeyService;
 
     @Override
     public Long createKnowledgeMy(AiKnowledgeCreateMyReqVO createReqVO, Long userId) {
@@ -75,4 +81,11 @@ public class AiKnowledgeServiceImpl implements AiKnowledgeService {
         return knowledgeMapper.selectPageByMy(userId, pageReqVO);
     }
 
+    @Override
+    public VectorStore getVectorStoreById(Long knowledgeId) {
+        // 1. Validate the knowledge base and load it
+        AiKnowledgeDO knowledgeBase = validateKnowledgeExists(knowledgeId);
+        // 2. Validate the model configured on the knowledge base
+        AiChatModelDO chatModel = chatModelService.validateChatModel(knowledgeBase.getModelId());
+        // 3. The vector store is obtained (or created) from the model's API key id
+        return apiKeyService.getOrCreateVectorStore(chatModel.getKeyId());
+    }
+
 }