|
@@ -14,8 +14,9 @@ import jakarta.annotation.Resource;
|
|
import lombok.extern.slf4j.Slf4j;
|
|
import lombok.extern.slf4j.Slf4j;
|
|
import org.springframework.ai.document.Document;
|
|
import org.springframework.ai.document.Document;
|
|
import org.springframework.ai.reader.tika.TikaDocumentReader;
|
|
import org.springframework.ai.reader.tika.TikaDocumentReader;
|
|
-import org.springframework.ai.tokenizer.JTokkitTokenCountEstimator;
|
|
|
|
|
|
+import org.springframework.ai.tokenizer.TokenCountEstimator;
|
|
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
|
|
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
|
|
|
|
+import org.springframework.ai.vectorstore.RedisVectorStore;
|
|
import org.springframework.beans.factory.annotation.Value;
|
|
import org.springframework.beans.factory.annotation.Value;
|
|
import org.springframework.stereotype.Service;
|
|
import org.springframework.stereotype.Service;
|
|
import org.springframework.transaction.annotation.Transactional;
|
|
import org.springframework.transaction.annotation.Transactional;
|
|
@@ -39,52 +40,49 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
|
|
|
|
|
|
@Resource
|
|
@Resource
|
|
private TokenTextSplitter tokenTextSplitter;
|
|
private TokenTextSplitter tokenTextSplitter;
|
|
-
|
|
|
|
@Resource
|
|
@Resource
|
|
- private AiEmbeddingService embeddingService;
|
|
|
|
|
|
+ private TokenCountEstimator TOKEN_COUNT_ESTIMATOR;
|
|
|
|
+ @Resource
|
|
|
|
+ private RedisVectorStore vectorStore;
|
|
|
|
|
|
- // TODO @xin:@Resource 注入
|
|
|
|
- private static final JTokkitTokenCountEstimator TOKEN_COUNT_ESTIMATOR = new JTokkitTokenCountEstimator();
|
|
|
|
|
|
|
|
// TODO xiaoxin 临时测试用,后续删
|
|
// TODO xiaoxin 临时测试用,后续删
|
|
@Value("classpath:/webapp/test/Fel.pdf")
|
|
@Value("classpath:/webapp/test/Fel.pdf")
|
|
private org.springframework.core.io.Resource data;
|
|
private org.springframework.core.io.Resource data;
|
|
|
|
|
|
// TODO 芋艿:需要 review 下,代码格式;
|
|
// TODO 芋艿:需要 review 下,代码格式;
|
|
- // TODO @xin:最好有 1、/2、/3 这种,让代码更有层次感
|
|
|
|
@Override
|
|
@Override
|
|
@Transactional(rollbackFor = Exception.class)
|
|
@Transactional(rollbackFor = Exception.class)
|
|
public Long createKnowledgeDocument(AiKnowledgeDocumentCreateReqVO createReqVO) {
|
|
public Long createKnowledgeDocument(AiKnowledgeDocumentCreateReqVO createReqVO) {
|
|
// TODO xiaoxin 后续从 url 加载
|
|
// TODO xiaoxin 后续从 url 加载
|
|
TikaDocumentReader loader = new TikaDocumentReader(data);
|
|
TikaDocumentReader loader = new TikaDocumentReader(data);
|
|
- // 加载文档
|
|
|
|
|
|
+ // 1.1 加载文档
|
|
List<Document> documents = loader.get();
|
|
List<Document> documents = loader.get();
|
|
Document document = CollUtil.getFirst(documents);
|
|
Document document = CollUtil.getFirst(documents);
|
|
// TODO @xin:是不是不存在,就抛出异常呀;或者 return 呀;
|
|
// TODO @xin:是不是不存在,就抛出异常呀;或者 return 呀;
|
|
- // TODO 芋艿 文档层面有没有可能会比较大,这两个字段是否可以从分段表计算得出?回复:先直接算;
|
|
|
|
Integer tokens = Objects.nonNull(document) ? TOKEN_COUNT_ESTIMATOR.estimate(document.getContent()) : 0;
|
|
Integer tokens = Objects.nonNull(document) ? TOKEN_COUNT_ESTIMATOR.estimate(document.getContent()) : 0;
|
|
Integer wordCount = Objects.nonNull(document) ? document.getContent().length() : 0;
|
|
Integer wordCount = Objects.nonNull(document) ? document.getContent().length() : 0;
|
|
|
|
|
|
AiKnowledgeDocumentDO documentDO = BeanUtils.toBean(createReqVO, AiKnowledgeDocumentDO.class)
|
|
AiKnowledgeDocumentDO documentDO = BeanUtils.toBean(createReqVO, AiKnowledgeDocumentDO.class)
|
|
.setTokens(tokens).setWordCount(wordCount)
|
|
.setTokens(tokens).setWordCount(wordCount)
|
|
.setStatus(CommonStatusEnum.ENABLE.getStatus()).setSliceStatus(AiKnowledgeDocumentStatusEnum.SUCCESS.getStatus());
|
|
.setStatus(CommonStatusEnum.ENABLE.getStatus()).setSliceStatus(AiKnowledgeDocumentStatusEnum.SUCCESS.getStatus());
|
|
- // 文档记录入库
|
|
|
|
|
|
+ // 1.2 文档记录入库
|
|
documentMapper.insert(documentDO);
|
|
documentMapper.insert(documentDO);
|
|
Long documentId = documentDO.getId();
|
|
Long documentId = documentDO.getId();
|
|
if (CollUtil.isEmpty(documents)) {
|
|
if (CollUtil.isEmpty(documents)) {
|
|
return documentId;
|
|
return documentId;
|
|
}
|
|
}
|
|
|
|
|
|
- // 文档分段
|
|
|
|
|
|
+ // 2.1 文档分段
|
|
List<Document> segments = tokenTextSplitter.apply(documents);
|
|
List<Document> segments = tokenTextSplitter.apply(documents);
|
|
- // 分段内容入库
|
|
|
|
|
|
+ // 2.2 分段内容入库
|
|
List<AiKnowledgeSegmentDO> segmentDOList = CollectionUtils.convertList(segments,
|
|
List<AiKnowledgeSegmentDO> segmentDOList = CollectionUtils.convertList(segments,
|
|
segment -> new AiKnowledgeSegmentDO().setContent(segment.getContent()).setDocumentId(documentId)
|
|
segment -> new AiKnowledgeSegmentDO().setContent(segment.getContent()).setDocumentId(documentId)
|
|
.setTokens(TOKEN_COUNT_ESTIMATOR.estimate(segment.getContent())).setWordCount(segment.getContent().length())
|
|
.setTokens(TOKEN_COUNT_ESTIMATOR.estimate(segment.getContent())).setWordCount(segment.getContent().length())
|
|
.setStatus(CommonStatusEnum.ENABLE.getStatus()));
|
|
.setStatus(CommonStatusEnum.ENABLE.getStatus()));
|
|
segmentMapper.insertBatch(segmentDOList);
|
|
segmentMapper.insertBatch(segmentDOList);
|
|
- // 向量化并存储
|
|
|
|
- embeddingService.add(segments);
|
|
|
|
|
|
+ // 3 向量化并存储
|
|
|
|
+ vectorStore.add(segments);
|
|
return documentId;
|
|
return documentId;
|
|
}
|
|
}
|
|
|
|
|