'use client' import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react' import { useTranslation } from 'react-i18next' import { useContext } from 'use-context-selector' import { useBoolean } from 'ahooks' import { XMarkIcon } from '@heroicons/react/20/solid' import { RocketLaunchIcon } from '@heroicons/react/24/outline' import { RiCloseLine, } from '@remixicon/react' import Link from 'next/link' import { groupBy } from 'lodash-es' import PreviewItem, { PreviewType } from './preview-item' import LanguageSelect from './language-select' import s from './index.module.css' import unescape from './unescape' import escape from './escape' import cn from '@/utils/classnames' import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets' import { createDocument, createFirstDocument, fetchFileIndexingEstimate as didFetchFileIndexingEstimate, fetchDefaultProcessRule, } from '@/service/datasets' import Button from '@/app/components/base/button' import Input from '@/app/components/base/input' import Loading from '@/app/components/base/loading' import FloatRightContainer from '@/app/components/base/float-right-container' import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config' import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config' import { type RetrievalConfig } from '@/types/app' import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model' import Toast from '@/app/components/base/toast' import { formatNumber } from '@/utils/format' import type { NotionPage } from '@/models/common' import { DataSourceProvider } from '@/models/common' import { DataSourceType, DocForm } from '@/models/datasets' import NotionIcon from 
'@/app/components/base/notion-icon'
import Switch from '@/app/components/base/switch'
import { MessageChatSquare } from '@/app/components/base/icons/src/public/common'
import { useDatasetDetailContext } from '@/context/dataset-detail'
import I18n from '@/context/i18n'
import { IS_CE_EDITION } from '@/config'
import { RETRIEVE_METHOD } from '@/types/app'
import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
import Tooltip from '@/app/components/base/tooltip'
import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
import { LanguagesSupported } from '@/i18n/language'
import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
import { Globe01 } from '@/app/components/base/icons/src/vender/line/mapsAndTravel'

/** Union of the property value types of `T`. */
type ValueOf<T> = T[keyof T]

/** Props accepted by {@link StepTwo}. */
type StepTwoProps = {
  /** When true the form edits an existing document: rules are read from `documentDetail` and `onSave` runs after a successful submit. */
  isSetting?: boolean
  /** Existing document; supplies process rules, segmentation mode, doc form and doc language when present. */
  documentDetail?: FullDocumentDetail
  /** Whether the high-quality index type may be selected; also picks the default index type. */
  isAPIKeySet: boolean
  /** NOTE(review): not referenced in the visible logic; presumably wired to a settings link in the markup — confirm. */
  onSetting: () => void
  /** When absent, submit goes through `createFirstDocument`; otherwise through `createDocument`. */
  datasetId?: string
  /** Index type fixed by the caller; when set, the user cannot switch index type. */
  indexingType?: ValueOf<IndexingType>
  /** Data source type used on the create page; otherwise the current dataset's own type wins. */
  dataSourceType: DataSourceType
  files: CustomFile[]
  notionPages?: NotionPage[]
  websitePages?: CrawlResultItem[]
  /** Only `only_main_content` is read from here. */
  crawlOptions?: CrawlOptions
  websiteCrawlProvider?: DataSourceProvider
  websiteCrawlJobId?: string
  /** Called with `+1` after a successful submit. */
  onStepChange?: (delta: number) => void
  /** Called with the selected index type after a successful submit. */
  updateIndexingTypeCache?: (type: string) => void
  /** Called with the creation response after a successful submit. */
  updateResultCache?: (res: createDocumentResponse) => void
  /** Called after a successful submit when `isSetting` is true. */
  onSave?: () => void
  /** NOTE(review): not referenced in the visible logic; presumably bound to a cancel button in the markup — confirm. */
  onCancel?: () => void
}

/** Segmentation modes sent as `process_rule.mode`. */
enum SegmentType {
  AUTO = 'automatic',
  CUSTOM = 'custom',
}

/** Index techniques sent as `indexing_technique`. */
enum IndexingType {
  QUALIFIED = 'high_quality',
  ECONOMICAL = 'economy',
}

/** Escaped form of a blank-line separator, used whenever the separator field is emptied. */
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'

/**
 * Second step of dataset document creation / document settings.
 *
 * Holds the segmentation rules, index type, QA/doc-language options, embedding
 * model and retrieval configuration, requests an indexing estimate for the
 * preview, and submits the result through `createFirstDocument` (no
 * `datasetId`) or `createDocument`.
 */
const StepTwo = ({
  isSetting,
  documentDetail,
  isAPIKeySet,
  onSetting,
  datasetId,
  indexingType,
  dataSourceType: inCreatePageDataSourceType,
  files,
  notionPages = [],
  websitePages = [],
  crawlOptions,
  websiteCrawlProvider = DataSourceProvider.fireCrawl,
  websiteCrawlJobId = '',
  onStepChange,
  updateIndexingTypeCache,
  updateResultCache,
  onSave,
  onCancel,
}: StepTwoProps) => {
  const { t } = useTranslation()
  const { locale } = useContext(I18n)
  const media = useBreakpoints()
  const isMobile = media === MediaType.mobile

  const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
  // A dataset id without a stored data source type is still treated as the create page.
  const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
  const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type

  // NOTE(review): the explicit type arguments on the refs and on the
  // null/empty-initialised states below are reconstructed from the type
  // imports and from how the values are used — confirm against the upstream file.
  const scrollRef = useRef<HTMLDivElement>(null)
  const [scrolled, setScrolled] = useState(false)
  const previewScrollRef = useRef<HTMLDivElement>(null)
  const [previewScrolled, setPreviewScrolled] = useState(false)

  const [segmentationType, setSegmentationType] = useState(SegmentType.AUTO)
  const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
  // The separator is stored escaped; an empty value falls back to the default.
  const setSegmentIdentifier = useCallback((value: string) => {
    doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER)
  }, [])
  const [maxChunkLength, setMaxChunkLength] = useState(4000) // default chunk length
  const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(4000)
  const [overlap, setOverlap] = useState(50)
  const [rules, setRules] = useState<PreProcessingRule[]>([])
  const [defaultConfig, setDefaultConfig] = useState<Rules>()

  const hasSetIndexType = !!indexingType
  const [indexType, setIndexType] = useState<ValueOf<IndexingType>>(
    (indexingType || isAPIKeySet)
      ? IndexingType.QUALIFIED
      : IndexingType.ECONOMICAL,
  )
  const [isLanguageSelectDisabled, setIsLanguageSelectDisabled] = useState(false)
  const [docForm, setDocForm] = useState(
    (datasetId && documentDetail) ? documentDetail.doc_form : DocForm.TEXT,
  )
  const [docLanguage, setDocLanguage] = useState(
    (datasetId && documentDetail)
      ? documentDetail.doc_language
      : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese'),
  )
  const [QATipHide, setQATipHide] = useState(false)
  const [previewSwitched, setPreviewSwitched] = useState(false)
  const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean()

  // One estimate per segmentation mode; the active mode selects which one is shown.
  const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
  const [automaticFileIndexingEstimate, setAutomaticFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
  const fileIndexingEstimate = segmentationType === SegmentType.AUTO
    ? automaticFileIndexingEstimate
    : customFileIndexingEstimate

  const [isCreating, setIsCreating] = useState(false)

  /** Tracks whether the main panel is scrolled away from the top. */
  const scrollHandle = (e: Event) => {
    setScrolled((e.target as HTMLDivElement).scrollTop > 0)
  }

  /** Tracks whether the preview panel is scrolled away from the top. */
  const previewScrollHandle = (e: Event) => {
    setPreviewScrolled((e.target as HTMLDivElement).scrollTop > 0)
  }

  /**
   * Strips the last extension from a file name.
   * A name with no extension (or only a leading dot) is returned unchanged
   * instead of collapsing to an empty string.
   */
  const getFileName = (name: string) => {
    const dotIndex = name.lastIndexOf('.')
    return dotIndex > 0 ? name.slice(0, dotIndex) : name
  }

  /** Maps a pre-processing rule id to its translated label; unknown ids yield `undefined`. */
  const getRuleName = (key: string) => {
    if (key === 'remove_extra_spaces')
      return t('datasetCreation.stepTwo.removeExtraSpaces')
    if (key === 'remove_urls_emails')
      return t('datasetCreation.stepTwo.removeUrlEmails')
    if (key === 'remove_stopwords')
      return t('datasetCreation.stepTwo.removeStopwords')
    return undefined
  }

  /** Toggles the `enabled` flag of the rule with the given id. */
  const ruleChangeHandle = (id: string) => {
    setRules(rules.map((rule) => {
      if (rule.id !== id)
        return rule
      return {
        id: rule.id,
        enabled: !rule.enabled,
      }
    }))
  }

  /** Restores separator, chunk length, overlap and rules from the loaded default config, if any. */
  const resetRules = () => {
    if (!defaultConfig)
      return
    setSegmentIdentifier(defaultConfig.segmentation.separator)
    setMaxChunkLength(defaultConfig.segmentation.max_tokens)
    setOverlap(defaultConfig.segmentation.chunk_overlap)
    setRules(defaultConfig.pre_processing_rules)
  }

  /**
   * Requests an indexing estimate and stores it in the slot of the current
   * segmentation mode. Does nothing when no request can be built for the
   * current data source type.
   */
  const fetchFileIndexingEstimate = async (docForm = DocForm.TEXT, language?: string) => {
    // eslint-disable-next-line @typescript-eslint/no-use-before-define
    const estimateParams = getFileIndexingEstimateParams(docForm, language)
    if (!estimateParams)
      return
    const res = await didFetchFileIndexingEstimate(estimateParams)
    if (segmentationType === SegmentType.CUSTOM)
      setCustomFileIndexingEstimate(res)
    else
      setAutomaticFileIndexingEstimate(res)
  }

  /** Validates the custom chunk length, then opens the preview and refreshes the estimate. */
  const confirmChangeCustomConfig = () => {
    if (segmentationType === SegmentType.CUSTOM && maxChunkLength > limitMaxChunkLength) {
      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
      return
    }
    setCustomFileIndexingEstimate(null)
    setShowPreview()
    fetchFileIndexingEstimate()
    setPreviewSwitched(false)
  }

  /** The index type from props takes precedence over the locally selected one. */
  const getIndexing_technique = () => indexingType || indexType

  /** Builds the `process_rule` payload; detailed rules are only filled in for custom segmentation. */
  const getProcessRule = () => {
    const processRule: ProcessRule = {
      rules: {} as any, // api will check this. It will be removed after api refactored.
      mode: segmentationType,
    }
    if (segmentationType === SegmentType.CUSTOM) {
      processRule.rules = {
        pre_processing_rules: rules,
        segmentation: {
          // State holds the escaped separator; the API receives the raw one.
          separator: unescape(segmentIdentifier),
          max_tokens: maxChunkLength,
          chunk_overlap: overlap,
        },
      }
    }
    return processRule
  }

  /** Groups the selected Notion pages by workspace into the request shape. */
  const getNotionInfo = () => {
    const pagesByWorkspace = groupBy(notionPages, 'workspace_id')
    return Object.keys(pagesByWorkspace).map((workspaceId) => {
      return {
        workspace_id: workspaceId,
        pages: pagesByWorkspace[workspaceId].map(({ page_id, page_name, page_icon, type }) => {
          return {
            page_id,
            page_name,
            page_icon,
            type,
          }
        }),
      }
    }) as NotionInfo[]
  }

  /** Builds the website crawl descriptor for the request. */
  const getWebsiteInfo = () => {
    return {
      provider: websiteCrawlProvider,
      job_id: websiteCrawlJobId,
      urls: websitePages.map(page => page.source_url),
      only_main_content: crawlOptions?.only_main_content,
    }
  }

  /**
   * Builds the indexing-estimate request for the current data source type.
   * Returns `undefined` when the type is none of FILE, NOTION or WEB.
   */
  const getFileIndexingEstimateParams = (docForm: DocForm, language?: string): IndexingEstimateParams | undefined => {
    if (dataSourceType === DataSourceType.FILE) {
      return {
        info_list: {
          data_source_type: dataSourceType,
          file_info_list: {
            file_ids: files.map(file => file.id) as string[],
          },
        },
        indexing_technique: getIndexing_technique() as string,
        process_rule: getProcessRule(),
        doc_form: docForm,
        doc_language: language || docLanguage,
        dataset_id: datasetId as string,
      }
    }
    if (dataSourceType === DataSourceType.NOTION) {
      return {
        info_list: {
          data_source_type: dataSourceType,
          notion_info_list: getNotionInfo(),
        },
        indexing_technique: getIndexing_technique() as string,
        process_rule: getProcessRule(),
        doc_form: docForm,
        doc_language: language || docLanguage,
        dataset_id: datasetId as string,
      }
    }
    if (dataSourceType === DataSourceType.WEB) {
      return {
        info_list: {
          data_source_type: dataSourceType,
          website_info_list: getWebsiteInfo(),
        },
        indexing_technique: getIndexing_technique() as string,
        process_rule: getProcessRule(),
        doc_form: docForm,
        doc_language: language || docLanguage,
        dataset_id: datasetId as string,
      }
    }
    return undefined
  }

  const {
    modelList: rerankModelList,
    defaultModel: rerankDefaultModel,
    currentModel: isRerankDefaultModelValid,
  } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
  const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
  const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
  // The dataset's own embedding model wins over the workspace default.
  const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
    currentDataset?.embedding_model
      ? {
        provider: currentDataset.embedding_model_provider,
        model: currentDataset.embedding_model,
      }
      : {
        provider: defaultEmbeddingModel?.provider.provider || '',
        model: defaultEmbeddingModel?.model || '',
      },
  )

  /**
   * Validates the form and builds the create/update request body.
   * Shows an error toast and returns `undefined` when validation fails.
   */
  const getCreationParams = () => {
    let params: CreateDocumentReq
    if (segmentationType === SegmentType.CUSTOM && overlap > maxChunkLength) {
      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
      return
    }
    if (segmentationType === SegmentType.CUSTOM && maxChunkLength > limitMaxChunkLength) {
      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
      return
    }
    if (isSetting) {
      params = {
        original_document_id: documentDetail?.id,
        doc_form: docForm,
        doc_language: docLanguage,
        process_rule: getProcessRule(),
        // eslint-disable-next-line @typescript-eslint/no-use-before-define
        retrieval_model: retrievalConfig, // Read-only here; change it on the dataset settings page.
        embedding_model: embeddingModel.model, // Read-only
        embedding_model_provider: embeddingModel.provider, // Read-only
      } as CreateDocumentReq
    }
    else { // create
      const indexMethod = getIndexing_technique()
      if (
        !isReRankModelSelected({
          rerankDefaultModel,
          isRerankDefaultModelValid: !!isRerankDefaultModelValid,
          rerankModelList,
          // eslint-disable-next-line @typescript-eslint/no-use-before-define
          retrievalConfig,
          indexMethod: indexMethod as string,
        })
      ) {
        Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
        return
      }
      const postRetrievalConfig = ensureRerankModelSelected({
        rerankDefaultModel: rerankDefaultModel!,
        // eslint-disable-next-line @typescript-eslint/no-use-before-define
        retrievalConfig,
        indexMethod: indexMethod as string,
      })
      params = {
        data_source: {
          type: dataSourceType,
          info_list: {
            data_source_type: dataSourceType,
          },
        },
        indexing_technique: getIndexing_technique(),
        process_rule: getProcessRule(),
        doc_form: docForm,
        doc_language: docLanguage,
        retrieval_model: postRetrievalConfig,
        embedding_model: embeddingModel.model,
        embedding_model_provider: embeddingModel.provider,
      } as CreateDocumentReq
      if (dataSourceType === DataSourceType.FILE) {
        params.data_source.info_list.file_info_list = {
          file_ids: files.map(file => file.id || '').filter(Boolean),
        }
      }
      if (dataSourceType === DataSourceType.NOTION)
        params.data_source.info_list.notion_info_list = getNotionInfo()

      if (dataSourceType === DataSourceType.WEB)
        params.data_source.info_list.website_info_list = getWebsiteInfo()
    }
    return params
  }

  /** Loads the default process rule and the max-token limit from the API; failures are only logged. */
  const getRules = async () => {
    try {
      const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
      const separator = res.rules.segmentation.separator
      setSegmentIdentifier(separator)
      setMaxChunkLength(res.rules.segmentation.max_tokens)
      setLimitMaxChunkLength(res.limits.indexing_max_segmentation_tokens_length)
      setOverlap(res.rules.segmentation.chunk_overlap)
      setRules(res.rules.pre_processing_rules)
      setDefaultConfig(res.rules)
    }
    catch (err) {
      console.log(err)
    }
  }

  /** Seeds the form from the existing document's process rule. */
  const getRulesFromDetail = () => {
    if (!documentDetail)
      return
    const detailRules = documentDetail.dataset_process_rule.rules
    setSegmentIdentifier(detailRules.segmentation.separator)
    setMaxChunkLength(detailRules.segmentation.max_tokens)
    setOverlap(detailRules.segmentation.chunk_overlap)
    setRules(detailRules.pre_processing_rules)
    setDefaultConfig(detailRules)
  }

  /** Seeds the segmentation mode from the existing document. */
  const getDefaultMode = () => {
    if (documentDetail)
      setSegmentationType(documentDetail.dataset_process_rule.mode)
  }

  /**
   * Submits the form. Re-entrant calls are ignored while a submit is in
   * flight; errors are surfaced as a toast and the in-flight flag is always
   * cleared.
   */
  const createHandle = async () => {
    if (isCreating)
      return
    setIsCreating(true)
    try {
      const params = getCreationParams()
      if (!params)
        return
      const res = datasetId
        ? await createDocument({
          datasetId,
          body: params as CreateDocumentReq,
        })
        : await createFirstDocument({
          body: params as CreateDocumentReq,
        })
      updateIndexingTypeCache?.(indexType as string)
      updateResultCache?.(res)
      mutateDatasetRes?.()
      onStepChange?.(+1)
      if (isSetting)
        onSave?.()
    }
    catch (err) {
      Toast.notify({
        type: 'error',
        message: `${err}`,
      })
    }
    finally {
      setIsCreating(false)
    }
  }

  /** Switches between QA and plain-text document form. */
  const handleSwitch = (state: boolean) => {
    setDocForm(state ? DocForm.QA : DocForm.TEXT)
  }

  /** Clears the active estimate and re-fetches it in QA form; the language select is disabled meanwhile. */
  const previewSwitch = async (language?: string) => {
    setPreviewSwitched(true)
    setIsLanguageSelectDisabled(true)
    if (segmentationType === SegmentType.AUTO)
      setAutomaticFileIndexingEstimate(null)
    else
      setCustomFileIndexingEstimate(null)
    try {
      await fetchFileIndexingEstimate(DocForm.QA, language)
    }
    finally {
      setIsLanguageSelectDisabled(false)
    }
  }

  /** Stores the chosen language and, if the QA preview is already showing, regenerates it. */
  const handleSelect = (language: string) => {
    setDocLanguage(language)
    // Switch language, re-cutter
    if (docForm === DocForm.QA && previewSwitched)
      previewSwitch(language)
  }

  /** Selects the economical index type (and plain-text form) unless the index type is fixed by props. */
  const changeToEconomicalType = () => {
    if (hasSetIndexType)
      return
    setIndexType(IndexingType.ECONOMICAL)
    setDocForm(DocForm.TEXT)
  }

  useEffect(() => {
    // fetch rules
    if (!isSetting) {
      getRules()
    }
    else {
      getRulesFromDetail()
      getDefaultMode()
    }
  }, [])

  useEffect(() => {
    // Capture the node so cleanup detaches from the same element it attached to.
    const node = scrollRef.current
    node?.addEventListener('scroll', scrollHandle)
    return () => {
      node?.removeEventListener('scroll', scrollHandle)
    }
  }, [])

  useLayoutEffect(() => {
    if (showPreview) {
      // Capture the node so cleanup detaches from the same element it attached to.
      const node = previewScrollRef.current
      node?.addEventListener('scroll', previewScrollHandle)
      return () => {
        node?.removeEventListener('scroll', previewScrollHandle)
      }
    }
  }, [showPreview])

  useEffect(() => {
    // QA form is not available with the economical index type.
    if (indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA)
      setDocForm(DocForm.TEXT)
  }, [indexingType, docForm])

  useEffect(() => {
    // get indexing type by props
    if (indexingType)
      setIndexType(indexingType as IndexingType)
    else
      setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
  }, [isAPIKeySet, indexingType, datasetId])

  useEffect(() => {
    if (segmentationType === SegmentType.AUTO) {
      setAutomaticFileIndexingEstimate(null)
      !isMobile && setShowPreview()
      fetchFileIndexingEstimate()
      setPreviewSwitched(false)
    }
    else {
      hidePreview()
      setCustomFileIndexingEstimate(null)
      setPreviewSwitched(false)
    }
  }, [segmentationType, indexType])

  const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
    search_method: RETRIEVE_METHOD.semantic,
    reranking_enable: false,
    reranking_model: {
      reranking_provider_name: rerankDefaultModel?.provider.provider,
      reranking_model_name: rerankDefaultModel?.model,
    },
    top_k: 3,
    score_threshold_enabled: false,
    score_threshold: 0.5,
  } as RetrievalConfig)

  // NOTE(review): the element markup of the expression below is absent from
  // this view of the file; only text nodes and embedded expressions remain.
  // It is kept as found, except that the first-file and first-website-page
  // lookups now use optional chaining so empty lists no longer throw.
  return (
{t('datasetCreation.steps.two')} {(isMobile || !showPreview) && ( )}
{t('datasetCreation.stepTwo.segmentation')}
setSegmentationType(SegmentType.AUTO)} >
{t('datasetCreation.stepTwo.auto')}
{t('datasetCreation.stepTwo.autoDescription')}
setSegmentationType(SegmentType.CUSTOM)} >
{t('datasetCreation.stepTwo.custom')}
{t('datasetCreation.stepTwo.customDescription')}
{segmentationType === SegmentType.CUSTOM && (
{t('datasetCreation.stepTwo.separator')} {t('datasetCreation.stepTwo.separatorTip')}
} />
setSegmentIdentifier(e.target.value)} />
{t('datasetCreation.stepTwo.maxLength')}
setMaxChunkLength(parseInt(e.target.value.replace(/^0+/, ''), 10))} />
{t('datasetCreation.stepTwo.overlap')} {t('datasetCreation.stepTwo.overlapTip')}
} />
setOverlap(parseInt(e.target.value.replace(/^0+/, ''), 10))} />
{t('datasetCreation.stepTwo.rules')}
{rules.map(rule => (
ruleChangeHandle(rule.id)} className="w-4 h-4 rounded border-gray-300 text-blue-700 focus:ring-blue-700" />
))}
)}
{t('datasetCreation.stepTwo.indexMode')}
{(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
{ if (isAPIKeySet) setIndexType(IndexingType.QUALIFIED) }} > {!hasSetIndexType && }
{t('datasetCreation.stepTwo.qualified')} {!hasSetIndexType && {t('datasetCreation.stepTwo.recommend')}}
{t('datasetCreation.stepTwo.qualifiedTip')}
{!isAPIKeySet && (
{t('datasetCreation.stepTwo.warning')}  {t('datasetCreation.stepTwo.click')}
)}
)} {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
{!hasSetIndexType && }
{t('datasetCreation.stepTwo.economical')}
{t('datasetCreation.stepTwo.economicalTip')}
)}
{hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
{t('datasetCreation.stepTwo.indexSettingTip')} {t('datasetCreation.stepTwo.datasetSettingLink')}
)} {IS_CE_EDITION && indexType === IndexingType.QUALIFIED && (
{t('datasetCreation.stepTwo.QATitle')}
{t('datasetCreation.stepTwo.QALanguage')}
{docForm === DocForm.QA && !QATipHide && (
{t('datasetCreation.stepTwo.QATip')} setQATipHide(true)} />
)}
)} {/* Embedding model */} {indexType === IndexingType.QUALIFIED && (
{t('datasetSettings.form.embeddingModel')}
{ setEmbeddingModel(model) }} /> {!!datasetId && (
{t('datasetCreation.stepTwo.indexSettingTip')} {t('datasetCreation.stepTwo.datasetSettingLink')}
)}
)} {/* Retrieval Method Config */}
{!datasetId ? (
{t('datasetSettings.form.retrievalSetting.title')}
{t('datasetSettings.form.retrievalSetting.learnMore')} {t('datasetSettings.form.retrievalSetting.longDescription')}
) : (
{t('datasetSettings.form.retrievalSetting.title')}
)}
{ getIndexing_technique() === IndexingType.QUALIFIED ? ( ) : ( ) }
{dataSourceType === DataSourceType.FILE && ( <>
{t('datasetCreation.stepTwo.fileSource')}
{getFileName(files[0]?.name || '')} {files.length > 1 && ( {t('datasetCreation.stepTwo.other')} {files.length - 1} {t('datasetCreation.stepTwo.fileUnit')} )}
)} {dataSourceType === DataSourceType.NOTION && ( <>
{t('datasetCreation.stepTwo.notionSource')}
{notionPages[0]?.page_name} {notionPages.length > 1 && ( {t('datasetCreation.stepTwo.other')} {notionPages.length - 1} {t('datasetCreation.stepTwo.notionUnit')} )}
)} {dataSourceType === DataSourceType.WEB && ( <>
{t('datasetCreation.stepTwo.websiteSource')}
{websitePages[0]?.source_url} {websitePages.length > 1 && ( {t('datasetCreation.stepTwo.other')} {websitePages.length - 1} {t('datasetCreation.stepTwo.webpageUnit')} )}
)}
{t('datasetCreation.stepTwo.estimateSegment')}
{ fileIndexingEstimate ? (
{formatNumber(fileIndexingEstimate.total_segments)}
) : (
{t('datasetCreation.stepTwo.calculating')}
) }
{!isSetting ? (
) : (
)}
{showPreview &&
{t('datasetCreation.stepTwo.previewTitle')}
{docForm === DocForm.QA && !previewSwitched && ( )}
{docForm === DocForm.QA && !previewSwitched && (
{t('datasetCreation.stepTwo.previewSwitchTipStart')} {t('datasetCreation.stepTwo.previewSwitchTipEnd')}
)}
{previewSwitched && docForm === DocForm.QA && fileIndexingEstimate?.qa_preview && ( <> {fileIndexingEstimate?.qa_preview.map((item, index) => ( ))} )} {(docForm === DocForm.TEXT || !previewSwitched) && fileIndexingEstimate?.preview && ( <> {fileIndexingEstimate?.preview.map((item, index) => ( ))} )} {previewSwitched && docForm === DocForm.QA && !fileIndexingEstimate?.qa_preview && (
)} {!previewSwitched && !fileIndexingEstimate?.preview && (
)}
} {!showPreview && (
{t('datasetCreation.stepTwo.sideTipTitle')}

{t('datasetCreation.stepTwo.sideTipP1')}

{t('datasetCreation.stepTwo.sideTipP2')}

{t('datasetCreation.stepTwo.sideTipP3')}

{t('datasetCreation.stepTwo.sideTipP4')}

)}
  )
}

export default StepTwo