index.tsx 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024
  1. 'use client'
  2. import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
  3. import { useTranslation } from 'react-i18next'
  4. import { useContext } from 'use-context-selector'
  5. import { useBoolean } from 'ahooks'
  6. import { XMarkIcon } from '@heroicons/react/20/solid'
  7. import { RocketLaunchIcon } from '@heroicons/react/24/outline'
  8. import {
  9. RiCloseLine,
  10. } from '@remixicon/react'
  11. import Link from 'next/link'
  12. import { groupBy } from 'lodash-es'
  13. import PreviewItem, { PreviewType } from './preview-item'
  14. import LanguageSelect from './language-select'
  15. import s from './index.module.css'
  16. import unescape from './unescape'
  17. import escape from './escape'
  18. import cn from '@/utils/classnames'
  19. import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
  20. import {
  21. createDocument,
  22. createFirstDocument,
  23. fetchFileIndexingEstimate as didFetchFileIndexingEstimate,
  24. fetchDefaultProcessRule,
  25. } from '@/service/datasets'
  26. import Button from '@/app/components/base/button'
  27. import Input from '@/app/components/base/input'
  28. import Loading from '@/app/components/base/loading'
  29. import FloatRightContainer from '@/app/components/base/float-right-container'
  30. import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
  31. import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
  32. import { type RetrievalConfig } from '@/types/app'
  33. import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
  34. import Toast from '@/app/components/base/toast'
  35. import { formatNumber } from '@/utils/format'
  36. import type { NotionPage } from '@/models/common'
  37. import { DataSourceProvider } from '@/models/common'
  38. import { DataSourceType, DocForm } from '@/models/datasets'
  39. import NotionIcon from '@/app/components/base/notion-icon'
  40. import Switch from '@/app/components/base/switch'
  41. import { MessageChatSquare } from '@/app/components/base/icons/src/public/common'
  42. import { useDatasetDetailContext } from '@/context/dataset-detail'
  43. import I18n from '@/context/i18n'
  44. import { IS_CE_EDITION } from '@/config'
  45. import { RETRIEVE_METHOD } from '@/types/app'
  46. import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
  47. import Tooltip from '@/app/components/base/tooltip'
  48. import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
  49. import { LanguagesSupported } from '@/i18n/language'
  50. import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
  51. import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
  52. import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
  53. import { Globe01 } from '@/app/components/base/icons/src/vender/line/mapsAndTravel'
  /** Indexed-access helper: the union of the types of all properties of `T`. */
  type ValueOf<T> = T[keyof T]
  /** Props accepted by the StepTwo component. */
  type StepTwoProps = {
    // When true, rules are loaded from `documentDetail` and saving targets that existing document.
    isSetting?: boolean
    // Existing document; seeds doc form, doc language and processing rules.
    documentDetail?: FullDocumentDetail
    // Decides the initial index type when `indexingType` is not supplied.
    isAPIKeySet: boolean
    // Click handler for the "set API key" warning link.
    onSetting: () => void
    // Absent: the first document of a new dataset is created. Present: a document is added to that dataset.
    datasetId?: string
    // When supplied, the index type is treated as already chosen.
    indexingType?: ValueOf<IndexingType>
    // Data source type used while in the create page.
    dataSourceType: DataSourceType
    files: CustomFile[]
    notionPages?: NotionPage[]
    websitePages?: CrawlResultItem[]
    crawlOptions?: CrawlOptions
    websiteCrawlProvider?: DataSourceProvider
    websiteCrawlJobId?: string
    // Invoked with +1 after a successful save.
    onStepChange?: (delta: number) => void
    // Receives the chosen index type after a successful save.
    updateIndexingTypeCache?: (type: string) => void
    // Receives the create-document response after a successful save.
    updateResultCache?: (res: createDocumentResponse) => void
    // Invoked after a successful save when `isSetting` is true.
    onSave?: () => void
    onCancel?: () => void
  }
  /** Segmentation mode; the value is sent as `process_rule.mode`. */
  enum SegmentType {
    AUTO = 'automatic',
    CUSTOM = 'custom',
  }
  /** Index mode; the value is sent as `indexing_technique`. */
  enum IndexingType {
    QUALIFIED = 'high_quality',
    ECONOMICAL = 'economy',
  }
  // Default separator kept in its escaped form (literal backslash-n twice);
  // it is passed through `unescape` before being sent as a process rule.
  const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
  84. const StepTwo = ({
  85. isSetting,
  86. documentDetail,
  87. isAPIKeySet,
  88. onSetting,
  89. datasetId,
  90. indexingType,
  91. dataSourceType: inCreatePageDataSourceType,
  92. files,
  93. notionPages = [],
  94. websitePages = [],
  95. crawlOptions,
  96. websiteCrawlProvider = DataSourceProvider.fireCrawl,
  97. websiteCrawlJobId = '',
  98. onStepChange,
  99. updateIndexingTypeCache,
  100. updateResultCache,
  101. onSave,
  102. onCancel,
  103. }: StepTwoProps) => {
  104. const { t } = useTranslation()
  105. const { locale } = useContext(I18n)
  106. const media = useBreakpoints()
  107. const isMobile = media === MediaType.mobile
  108. const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
  109. const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
  110. const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
  111. const scrollRef = useRef<HTMLDivElement>(null)
  112. const [scrolled, setScrolled] = useState(false)
  113. const previewScrollRef = useRef<HTMLDivElement>(null)
  114. const [previewScrolled, setPreviewScrolled] = useState(false)
  115. const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
  116. const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
  // Stores the separator in escaped form; an empty value falls back to the default separator.
  const setSegmentIdentifier = useCallback((value: string) => {
    if (!value) {
      doSetSegmentIdentifier(DEFAULT_SEGMENT_IDENTIFIER)
      return
    }
    doSetSegmentIdentifier(escape(value))
  }, [])
  120. const [max, setMax] = useState(4000) // default chunk length
  121. const [overlap, setOverlap] = useState(50)
  122. const [rules, setRules] = useState<PreProcessingRule[]>([])
  123. const [defaultConfig, setDefaultConfig] = useState<Rules>()
  124. const hasSetIndexType = !!indexingType
  125. const [indexType, setIndexType] = useState<ValueOf<IndexingType>>(
  126. (indexingType
  127. || isAPIKeySet)
  128. ? IndexingType.QUALIFIED
  129. : IndexingType.ECONOMICAL,
  130. )
  131. const [isLanguageSelectDisabled, setIsLanguageSelectDisabled] = useState(false)
  132. const [docForm, setDocForm] = useState<DocForm | string>(
  133. (datasetId && documentDetail) ? documentDetail.doc_form : DocForm.TEXT,
  134. )
  135. const [docLanguage, setDocLanguage] = useState<string>(
  136. (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese'),
  137. )
  138. const [QATipHide, setQATipHide] = useState(false)
  139. const [previewSwitched, setPreviewSwitched] = useState(false)
  140. const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean()
  141. const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
  142. const [automaticFileIndexingEstimate, setAutomaticFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
  // Estimate that belongs to the currently selected segmentation mode.
  const fileIndexingEstimate = segmentationType === SegmentType.AUTO
    ? automaticFileIndexingEstimate
    : customFileIndexingEstimate
  146. const [isCreating, setIsCreating] = useState(false)
  // Tracks whether the form container has been scrolled away from the top.
  const scrollHandle = (e: Event) => {
    const { scrollTop } = e.target as HTMLDivElement
    setScrolled(scrollTop > 0)
  }
  // Tracks whether the preview container has been scrolled away from the top.
  const previewScrollHandle = (e: Event) => {
    const { scrollTop } = e.target as HTMLDivElement
    setPreviewScrolled(scrollTop > 0)
  }
  /**
   * Returns `name` without its final extension.
   * Names with no dot ("README") or only a leading dot (".env") have no
   * extension to strip and are returned unchanged instead of becoming ''.
   */
  const getFileName = (name: string) => {
    const lastDot = name.lastIndexOf('.')
    if (lastDot <= 0)
      return name
    return name.slice(0, lastDot)
  }
  // Maps a pre-processing rule id to its translated label; unknown ids yield undefined.
  const getRuleName = (key: string) => {
    switch (key) {
      case 'remove_extra_spaces':
        return t('datasetCreation.stepTwo.removeExtraSpaces')
      case 'remove_urls_emails':
        return t('datasetCreation.stepTwo.removeUrlEmails')
      case 'remove_stopwords':
        return t('datasetCreation.stepTwo.removeStopwords')
    }
  }
  // Flips the `enabled` flag of the rule with the given id; other rules are kept as they are.
  const ruleChangeHandle = (id: string) => {
    const toggledRules = rules.map(rule => (
      rule.id === id
        ? { id: rule.id, enabled: !rule.enabled }
        : rule
    ))
    setRules(toggledRules)
  }
  // Restores the custom segmentation form to the stored default config, if one has been loaded.
  const resetRules = () => {
    if (!defaultConfig)
      return
    const { segmentation, pre_processing_rules: defaultRules } = defaultConfig
    setSegmentIdentifier(segmentation.separator)
    setMax(segmentation.max_tokens)
    setOverlap(segmentation.chunk_overlap)
    setRules(defaultRules)
  }
  // Requests an indexing estimate and stores it in the slot of the current segmentation mode.
  const fetchFileIndexingEstimate = async (docForm = DocForm.TEXT, language?: string) => {
    // eslint-disable-next-line @typescript-eslint/no-use-before-define
    const estimateParams = getFileIndexingEstimateParams(docForm, language)!
    const estimate = await didFetchFileIndexingEstimate(estimateParams)
    const storeEstimate = segmentationType === SegmentType.CUSTOM
      ? setCustomFileIndexingEstimate
      : setAutomaticFileIndexingEstimate
    storeEstimate(estimate)
  }
  /**
   * Handler of the custom-segmentation "preview" button.
   * Applies the same custom-mode limits as getCreationParams (overlap must not
   * exceed max, max must not exceed 4000) so a preview is never requested with
   * settings the save path rejects, then clears the old custom estimate and
   * fetches a fresh one.
   */
  const confirmChangeCustomConfig = () => {
    if (segmentationType === SegmentType.CUSTOM) {
      if (overlap > max) {
        Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
        return
      }
      if (max > 4000) {
        Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
        return
      }
    }
    setCustomFileIndexingEstimate(null)
    setShowPreview()
    fetchFileIndexingEstimate()
    setPreviewSwitched(false)
  }
  // The index type passed in via props wins over the locally selected one.
  const getIndexing_technique = () => {
    return indexingType || indexType
  }
  // Builds the process rule payload; only custom mode carries concrete rules.
  const getProcessRule = () => {
    const isCustom = segmentationType === SegmentType.CUSTOM
    const processRule: ProcessRule = {
      // api will check this. It will be removed after api refactored.
      rules: isCustom
        ? {
          pre_processing_rules: rules,
          segmentation: {
            separator: unescape(segmentIdentifier),
            max_tokens: max,
            chunk_overlap: overlap,
          },
        }
        : ({} as any),
      mode: segmentationType,
    }
    return processRule
  }
  // Groups the selected Notion pages by workspace and keeps only the page fields sent to the API.
  const getNotionInfo = () => {
    const pagesByWorkspace = groupBy(notionPages, 'workspace_id')
    return Object.entries(pagesByWorkspace).map(([workspaceId, pages]) => ({
      workspace_id: workspaceId,
      pages: pages.map(({ page_id, page_name, page_icon, type }) => ({
        page_id,
        page_name,
        page_icon,
        type,
      })),
    })) as NotionInfo[]
  }
  // Describes the crawled website pages (provider, job id, urls, crawl option) for the API payload.
  const getWebsiteInfo = () => {
    const urls = websitePages.map(({ source_url }) => source_url)
    return {
      provider: websiteCrawlProvider,
      job_id: websiteCrawlJobId,
      urls,
      only_main_content: crawlOptions?.only_main_content,
    }
  }
  /**
   * Builds the indexing-estimate request for the active data source.
   * Returns undefined when the data source type is none of FILE / NOTION / WEB.
   */
  const getFileIndexingEstimateParams = (docForm: DocForm, language?: string): IndexingEstimateParams | undefined => {
    // Adds the fields that are identical for every data source.
    const withSharedFields = (infoList: IndexingEstimateParams['info_list']): IndexingEstimateParams => ({
      info_list: infoList,
      indexing_technique: getIndexing_technique() as string,
      process_rule: getProcessRule(),
      doc_form: docForm,
      doc_language: language || docLanguage,
      dataset_id: datasetId as string,
    })

    switch (dataSourceType) {
      case DataSourceType.FILE:
        return withSharedFields({
          data_source_type: dataSourceType,
          file_info_list: {
            file_ids: files.map(file => file.id) as string[],
          },
        })
      case DataSourceType.NOTION:
        return withSharedFields({
          data_source_type: dataSourceType,
          notion_info_list: getNotionInfo(),
        })
      case DataSourceType.WEB:
        return withSharedFields({
          data_source_type: dataSourceType,
          website_info_list: getWebsiteInfo(),
        })
      default:
        return undefined
    }
  }
  302. const {
  303. modelList: rerankModelList,
  304. defaultModel: rerankDefaultModel,
  305. currentModel: isRerankDefaultModelValid,
  306. } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
  307. const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
  308. const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
  309. const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
  310. currentDataset?.embedding_model
  311. ? {
  312. provider: currentDataset.embedding_model_provider,
  313. model: currentDataset.embedding_model,
  314. }
  315. : {
  316. provider: defaultEmbeddingModel?.provider.provider || '',
  317. model: defaultEmbeddingModel?.model || '',
  318. },
  319. )
  /**
   * Validates the custom segmentation limits and assembles the request body used
   * when saving. Returns undefined (after an error toast) when validation fails
   * or, in create mode, when no rerank model is selected.
   */
  const getCreationParams = () => {
    if (segmentationType === SegmentType.CUSTOM) {
      if (overlap > max) {
        Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
        return
      }
      if (max > 4000) {
        Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
        return
      }
    }

    // Settings mode: the payload targets the existing document.
    if (isSetting) {
      return {
        original_document_id: documentDetail?.id,
        doc_form: docForm,
        doc_language: docLanguage,
        process_rule: getProcessRule(),
        // eslint-disable-next-line @typescript-eslint/no-use-before-define
        retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
        embedding_model: embeddingModel.model, // Readonly
        embedding_model_provider: embeddingModel.provider, // Readonly
      } as CreateDocumentReq
    }

    // Create mode.
    const indexMethod = getIndexing_technique()
    const rerankSelected = isReRankModelSelected({
      rerankDefaultModel,
      isRerankDefaultModelValid: !!isRerankDefaultModelValid,
      rerankModelList,
      // eslint-disable-next-line @typescript-eslint/no-use-before-define
      retrievalConfig,
      indexMethod: indexMethod as string,
    })
    if (!rerankSelected) {
      Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
      return
    }
    const postRetrievalConfig = ensureRerankModelSelected({
      rerankDefaultModel: rerankDefaultModel!,
      // eslint-disable-next-line @typescript-eslint/no-use-before-define
      retrievalConfig,
      indexMethod: indexMethod as string,
    })
    const params = {
      data_source: {
        type: dataSourceType,
        info_list: {
          data_source_type: dataSourceType,
        },
      },
      indexing_technique: indexMethod,
      process_rule: getProcessRule(),
      doc_form: docForm,
      doc_language: docLanguage,
      retrieval_model: postRetrievalConfig,
      embedding_model: embeddingModel.model,
      embedding_model_provider: embeddingModel.provider,
    } as CreateDocumentReq

    // Attach the source-specific list; the three types are mutually exclusive.
    if (dataSourceType === DataSourceType.FILE) {
      params.data_source.info_list.file_info_list = {
        file_ids: files.map(file => file.id || '').filter(Boolean),
      }
    }
    else if (dataSourceType === DataSourceType.NOTION) {
      params.data_source.info_list.notion_info_list = getNotionInfo()
    }
    else if (dataSourceType === DataSourceType.WEB) {
      params.data_source.info_list.website_info_list = getWebsiteInfo()
    }
    return params
  }
  // Loads the default process rule and uses it both as the form values and as the reset target.
  const getRules = async () => {
    try {
      const { rules: fetchedRules } = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
      const { segmentation } = fetchedRules
      setSegmentIdentifier(segmentation.separator)
      setMax(segmentation.max_tokens)
      setOverlap(segmentation.chunk_overlap)
      setRules(fetchedRules.pre_processing_rules)
      setDefaultConfig(fetchedRules)
    }
    catch (err) {
      console.log(err)
    }
  }
  // Seeds the form (and the reset target) from the rules stored on the existing document.
  const getRulesFromDetail = () => {
    if (!documentDetail)
      return
    const detailRules = documentDetail.dataset_process_rule.rules
    const {
      separator,
      max_tokens: maxTokens,
      chunk_overlap: chunkOverlap,
    } = detailRules.segmentation
    setSegmentIdentifier(separator)
    setMax(maxTokens)
    setOverlap(chunkOverlap)
    setRules(detailRules.pre_processing_rules)
    setDefaultConfig(detailRules)
  }
  // Adopts the segmentation mode stored on the existing document, when there is one.
  const getDefaultMode = () => {
    if (!documentDetail)
      return
    setSegmentationType(documentDetail.dataset_process_rule.mode)
  }
  /**
   * Saves the document: creates the first document of a new dataset when there
   * is no `datasetId`, otherwise adds a document to the existing dataset.
   * Re-entrant calls are ignored while a save is in flight. Resolves to `false`
   * when the parameters fail validation; failures are reported via an error toast.
   */
  const createHandle = async () => {
    if (isCreating)
      return
    setIsCreating(true)
    try {
      const params = getCreationParams()
      if (!params)
        return false
      const res = datasetId
        ? await createDocument({
          datasetId,
          body: params as CreateDocumentReq,
        })
        : await createFirstDocument({
          body: params as CreateDocumentReq,
        })
      // Both creation paths publish the same cache updates.
      updateIndexingTypeCache?.(indexType as string)
      updateResultCache?.(res)
      mutateDatasetRes?.()
      onStepChange?.(+1)
      if (isSetting)
        onSave?.()
    }
    catch (err) {
      Toast.notify({
        type: 'error',
        message: `${err}`,
      })
    }
    finally {
      // Always release the in-flight flag, including on the early `return false`.
      setIsCreating(false)
    }
  }
  // Switch on selects the QA document form; switch off selects plain text.
  const handleSwitch = (state: boolean) => {
    setDocForm(state ? DocForm.QA : DocForm.TEXT)
  }
  // Re-runs the estimate in QA form; the language select stays disabled until the request settles.
  const previewSwitch = async (language?: string) => {
    setPreviewSwitched(true)
    setIsLanguageSelectDisabled(true)
    const clearEstimate = segmentationType === SegmentType.AUTO
      ? setAutomaticFileIndexingEstimate
      : setCustomFileIndexingEstimate
    clearEstimate(null)
    try {
      await fetchFileIndexingEstimate(DocForm.QA, language)
    }
    finally {
      setIsLanguageSelectDisabled(false)
    }
  }
  // Stores the chosen language and refreshes an already switched QA preview with it.
  const handleSelect = (language: string) => {
    setDocLanguage(language)
    // Switch language, re-cutter
    const shouldRefreshPreview = docForm === DocForm.QA && previewSwitched
    if (shouldRefreshPreview)
      previewSwitch(language)
  }
  // Selects the economical index type (and plain-text form) unless the index type was fixed via props.
  const changeToEconomicalType = () => {
    if (hasSetIndexType)
      return
    setIndexType(IndexingType.ECONOMICAL)
    setDocForm(DocForm.TEXT)
  }
  493. useEffect(() => {
  494. // fetch rules
  495. if (!isSetting) {
  496. getRules()
  497. }
  498. else {
  499. getRulesFromDetail()
  500. getDefaultMode()
  501. }
  502. }, [])
  // Subscribes to scroll events of the form container for the lifetime of the component.
  useEffect(() => {
    // Capture the node now: when the cleanup runs, `scrollRef.current` may already
    // be null or point at another element, which would leave the listener attached.
    const scrollNode = scrollRef.current
    scrollNode?.addEventListener('scroll', scrollHandle)
    return () => {
      scrollNode?.removeEventListener('scroll', scrollHandle)
    }
  }, [])
  // Subscribes to scroll events of the preview container while the preview is shown.
  useLayoutEffect(() => {
    if (showPreview) {
      // Capture the node now: when the cleanup runs, `previewScrollRef.current`
      // may already be null, which would leave the listener attached.
      const previewNode = previewScrollRef.current
      previewNode?.addEventListener('scroll', previewScrollHandle)
      return () => {
        previewNode?.removeEventListener('scroll', previewScrollHandle)
      }
    }
  }, [showPreview])
  517. useEffect(() => {
  518. if (indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA)
  519. setDocForm(DocForm.TEXT)
  520. }, [indexingType, docForm])
  521. useEffect(() => {
  522. // get indexing type by props
  523. if (indexingType)
  524. setIndexType(indexingType as IndexingType)
  525. else
  526. setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
  527. }, [isAPIKeySet, indexingType, datasetId])
  528. useEffect(() => {
  529. if (segmentationType === SegmentType.AUTO) {
  530. setAutomaticFileIndexingEstimate(null)
  531. !isMobile && setShowPreview()
  532. fetchFileIndexingEstimate()
  533. setPreviewSwitched(false)
  534. }
  535. else {
  536. hidePreview()
  537. setCustomFileIndexingEstimate(null)
  538. setPreviewSwitched(false)
  539. }
  540. }, [segmentationType, indexType])
  541. const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
  542. search_method: RETRIEVE_METHOD.semantic,
  543. reranking_enable: false,
  544. reranking_model: {
  545. reranking_provider_name: rerankDefaultModel?.provider.provider,
  546. reranking_model_name: rerankDefaultModel?.model,
  547. },
  548. top_k: 3,
  549. score_threshold_enabled: false,
  550. score_threshold: 0.5,
  551. } as RetrievalConfig)
  552. return (
  553. <div className='flex w-full h-full'>
  554. <div ref={scrollRef} className='relative h-full w-full overflow-y-scroll'>
  555. <div className={cn(s.pageHeader, scrolled && s.fixed, isMobile && '!px-6')}>
  556. <span>{t('datasetCreation.steps.two')}</span>
  557. {(isMobile || !showPreview) && (
  558. <Button
  559. className='border-[0.5px] !h-8 hover:outline hover:outline-[0.5px] hover:outline-gray-300 text-gray-700 font-medium bg-white shadow-[0px_1px_2px_0px_rgba(16,24,40,0.05)]'
  560. onClick={setShowPreview}
  561. >
  562. <Tooltip>
  563. <div className="flex flex-row items-center">
  564. <RocketLaunchIcon className="h-4 w-4 mr-1.5 stroke-[1.8px]" />
  565. <span className="text-[13px]">{t('datasetCreation.stepTwo.previewTitleButton')}</span>
  566. </div>
  567. </Tooltip>
  568. </Button>
  569. )}
  570. </div>
  571. <div className={cn(s.form, isMobile && '!px-4')}>
  572. <div className={s.label}>{t('datasetCreation.stepTwo.segmentation')}</div>
  573. <div className='max-w-[640px]'>
  574. <div
  575. className={cn(
  576. s.radioItem,
  577. s.segmentationItem,
  578. segmentationType === SegmentType.AUTO && s.active,
  579. )}
  580. onClick={() => setSegmentationType(SegmentType.AUTO)}
  581. >
  582. <span className={cn(s.typeIcon, s.auto)} />
  583. <span className={cn(s.radio)} />
  584. <div className={s.typeHeader}>
  585. <div className={s.title}>{t('datasetCreation.stepTwo.auto')}</div>
  586. <div className={s.tip}>{t('datasetCreation.stepTwo.autoDescription')}</div>
  587. </div>
  588. </div>
  589. <div
  590. className={cn(
  591. s.radioItem,
  592. s.segmentationItem,
  593. segmentationType === SegmentType.CUSTOM && s.active,
  594. segmentationType === SegmentType.CUSTOM && s.custom,
  595. )}
  596. onClick={() => setSegmentationType(SegmentType.CUSTOM)}
  597. >
  598. <span className={cn(s.typeIcon, s.customize)} />
  599. <span className={cn(s.radio)} />
  600. <div className={s.typeHeader}>
  601. <div className={s.title}>{t('datasetCreation.stepTwo.custom')}</div>
  602. <div className={s.tip}>{t('datasetCreation.stepTwo.customDescription')}</div>
  603. </div>
  604. {segmentationType === SegmentType.CUSTOM && (
  605. <div className={s.typeFormBody}>
  606. <div className={s.formRow}>
  607. <div className='w-full'>
  608. <div className={s.label}>
  609. {t('datasetCreation.stepTwo.separator')}
  610. <Tooltip
  611. popupContent={
  612. <div className='max-w-[200px]'>
  613. {t('datasetCreation.stepTwo.separatorTip')}
  614. </div>
  615. }
  616. />
  617. </div>
  618. <Input
  619. type="text"
  620. className='h-9'
  621. placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''} value={segmentIdentifier}
  622. onChange={e => setSegmentIdentifier(e.target.value)}
  623. />
  624. </div>
  625. </div>
  626. <div className={s.formRow}>
  627. <div className='w-full'>
  628. <div className={s.label}>{t('datasetCreation.stepTwo.maxLength')}</div>
  629. <Input
  630. type="number"
  631. className='h-9'
  632. placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
  633. value={max}
  634. max={4000}
  635. min={1}
  636. onChange={e => setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))}
  637. />
  638. </div>
  639. </div>
  640. <div className={s.formRow}>
  641. <div className='w-full'>
  642. <div className={s.label}>
  643. {t('datasetCreation.stepTwo.overlap')}
  644. <Tooltip
  645. popupContent={
  646. <div className='max-w-[200px]'>
  647. {t('datasetCreation.stepTwo.overlapTip')}
  648. </div>
  649. }
  650. />
  651. </div>
  652. <Input
  653. type="number"
  654. className='h-9'
  655. placeholder={t('datasetCreation.stepTwo.overlap') || ''}
  656. value={overlap}
  657. min={1}
  658. onChange={e => setOverlap(parseInt(e.target.value.replace(/^0+/, ''), 10))}
  659. />
  660. </div>
  661. </div>
  662. <div className={s.formRow}>
  663. <div className='w-full flex flex-col gap-1'>
  664. <div className={s.label}>{t('datasetCreation.stepTwo.rules')}</div>
  665. {rules.map(rule => (
  666. <div key={rule.id} className={s.ruleItem}>
  667. <input id={rule.id} type="checkbox" checked={rule.enabled} onChange={() => ruleChangeHandle(rule.id)} className="w-4 h-4 rounded border-gray-300 text-blue-700 focus:ring-blue-700" />
  668. <label htmlFor={rule.id} className="ml-2 text-sm font-normal cursor-pointer text-gray-800">{getRuleName(rule.id)}</label>
  669. </div>
  670. ))}
  671. </div>
  672. </div>
  673. <div className={s.formFooter}>
  674. <Button variant="primary" className={cn(s.button)} onClick={confirmChangeCustomConfig}>{t('datasetCreation.stepTwo.preview')}</Button>
  675. <Button className={cn(s.button, 'ml-2')} onClick={resetRules}>{t('datasetCreation.stepTwo.reset')}</Button>
  676. </div>
  677. </div>
  678. )}
  679. </div>
  680. </div>
  681. <div className={s.label}>{t('datasetCreation.stepTwo.indexMode')}</div>
  682. <div className='max-w-[640px]'>
  683. <div className='flex items-center gap-3 flex-wrap sm:flex-nowrap'>
  684. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
  685. <div
  686. className={cn(
  687. s.radioItem,
  688. s.indexItem,
  689. !isAPIKeySet && s.disabled,
  690. !hasSetIndexType && indexType === IndexingType.QUALIFIED && s.active,
  691. hasSetIndexType && s.disabled,
  692. hasSetIndexType && '!w-full !min-h-[96px]',
  693. )}
  694. onClick={() => {
  695. if (isAPIKeySet)
  696. setIndexType(IndexingType.QUALIFIED)
  697. }}
  698. >
  699. <span className={cn(s.typeIcon, s.qualified)} />
  700. {!hasSetIndexType && <span className={cn(s.radio)} />}
  701. <div className={s.typeHeader}>
  702. <div className={s.title}>
  703. {t('datasetCreation.stepTwo.qualified')}
  704. {!hasSetIndexType && <span className={s.recommendTag}>{t('datasetCreation.stepTwo.recommend')}</span>}
  705. </div>
  706. <div className={s.tip}>{t('datasetCreation.stepTwo.qualifiedTip')}</div>
  707. </div>
  708. {!isAPIKeySet && (
  709. <div className={s.warningTip}>
  710. <span>{t('datasetCreation.stepTwo.warning')}&nbsp;</span>
  711. <span className={s.click} onClick={onSetting}>{t('datasetCreation.stepTwo.click')}</span>
  712. </div>
  713. )}
  714. </div>
  715. )}
  716. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
  717. <div
  718. className={cn(
  719. s.radioItem,
  720. s.indexItem,
  721. !hasSetIndexType && indexType === IndexingType.ECONOMICAL && s.active,
  722. hasSetIndexType && s.disabled,
  723. hasSetIndexType && '!w-full !min-h-[96px]',
  724. )}
  725. onClick={changeToEconomicalType}
  726. >
  727. <span className={cn(s.typeIcon, s.economical)} />
  728. {!hasSetIndexType && <span className={cn(s.radio)} />}
  729. <div className={s.typeHeader}>
  730. <div className={s.title}>{t('datasetCreation.stepTwo.economical')}</div>
  731. <div className={s.tip}>{t('datasetCreation.stepTwo.economicalTip')}</div>
  732. </div>
  733. </div>
  734. )}
  735. </div>
  736. {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
  737. <div className='mt-2 text-xs text-gray-500 font-medium'>
  738. {t('datasetCreation.stepTwo.indexSettingTip')}
  739. <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  740. </div>
  741. )}
  742. {IS_CE_EDITION && indexType === IndexingType.QUALIFIED && (
  743. <div className='mt-3 rounded-xl bg-gray-50 border border-gray-100'>
  744. <div className='flex justify-between items-center px-5 py-4'>
  745. <div className='flex justify-center items-center w-8 h-8 rounded-lg bg-indigo-50'>
  746. <MessageChatSquare className='w-4 h-4' />
  747. </div>
  748. <div className='grow mx-3'>
  749. <div className='mb-[2px] text-md font-medium text-gray-900'>{t('datasetCreation.stepTwo.QATitle')}</div>
  750. <div className='inline-flex items-center text-[13px] leading-[18px] text-gray-500'>
  751. <span className='pr-1'>{t('datasetCreation.stepTwo.QALanguage')}</span>
  752. <LanguageSelect currentLanguage={docLanguage} onSelect={handleSelect} disabled={isLanguageSelectDisabled} />
  753. </div>
  754. </div>
  755. <div className='shrink-0'>
  756. <Switch
  757. defaultValue={docForm === DocForm.QA}
  758. onChange={handleSwitch}
  759. size='md'
  760. />
  761. </div>
  762. </div>
  763. {docForm === DocForm.QA && !QATipHide && (
  764. <div className='flex justify-between items-center px-5 py-2 bg-orange-50 border-t border-amber-100 rounded-b-xl text-[13px] leading-[18px] text-medium text-amber-500'>
  765. {t('datasetCreation.stepTwo.QATip')}
  766. <RiCloseLine className='w-4 h-4 text-gray-500 cursor-pointer' onClick={() => setQATipHide(true)} />
  767. </div>
  768. )}
  769. </div>
  770. )}
  771. {/* Embedding model */}
  772. {indexType === IndexingType.QUALIFIED && (
  773. <div className='mb-2'>
  774. <div className={cn(s.label, datasetId && 'flex justify-between items-center')}>{t('datasetSettings.form.embeddingModel')}</div>
  775. <ModelSelector
  776. readonly={!!datasetId}
  777. defaultModel={embeddingModel}
  778. modelList={embeddingModelList}
  779. onSelect={(model: DefaultModel) => {
  780. setEmbeddingModel(model)
  781. }}
  782. />
  783. {!!datasetId && (
  784. <div className='mt-2 text-xs text-gray-500 font-medium'>
  785. {t('datasetCreation.stepTwo.indexSettingTip')}
  786. <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  787. </div>
  788. )}
  789. </div>
  790. )}
  791. {/* Retrieval Method Config */}
  792. <div>
  793. {!datasetId
  794. ? (
  795. <div className={s.label}>
  796. <div className='shrink-0 mr-4'>{t('datasetSettings.form.retrievalSetting.title')}</div>
  797. <div className='leading-[18px] text-xs font-normal text-gray-500'>
  798. <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-[#155eef]'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
  799. {t('datasetSettings.form.retrievalSetting.longDescription')}
  800. </div>
  801. </div>
  802. )
  803. : (
  804. <div className={cn(s.label, 'flex justify-between items-center')}>
  805. <div>{t('datasetSettings.form.retrievalSetting.title')}</div>
  806. </div>
  807. )}
  808. <div className='max-w-[640px]'>
  809. {
  810. getIndexing_technique() === IndexingType.QUALIFIED
  811. ? (
  812. <RetrievalMethodConfig
  813. value={retrievalConfig}
  814. onChange={setRetrievalConfig}
  815. />
  816. )
  817. : (
  818. <EconomicalRetrievalMethodConfig
  819. value={retrievalConfig}
  820. onChange={setRetrievalConfig}
  821. />
  822. )
  823. }
  824. </div>
  825. </div>
  826. <div className={s.source}>
  827. <div className={s.sourceContent}>
  828. {dataSourceType === DataSourceType.FILE && (
  829. <>
  830. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.fileSource')}</div>
  831. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  832. <span className={cn(s.fileIcon, files.length && s[files[0].extension || ''])} />
  833. {getFileName(files[0].name || '')}
  834. {files.length > 1 && (
  835. <span className={s.sourceCount}>
  836. <span>{t('datasetCreation.stepTwo.other')}</span>
  837. <span>{files.length - 1}</span>
  838. <span>{t('datasetCreation.stepTwo.fileUnit')}</span>
  839. </span>
  840. )}
  841. </div>
  842. </>
  843. )}
  844. {dataSourceType === DataSourceType.NOTION && (
  845. <>
  846. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.notionSource')}</div>
  847. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  848. <NotionIcon
  849. className='shrink-0 mr-1'
  850. type='page'
  851. src={notionPages[0]?.page_icon}
  852. />
  853. {notionPages[0]?.page_name}
  854. {notionPages.length > 1 && (
  855. <span className={s.sourceCount}>
  856. <span>{t('datasetCreation.stepTwo.other')}</span>
  857. <span>{notionPages.length - 1}</span>
  858. <span>{t('datasetCreation.stepTwo.notionUnit')}</span>
  859. </span>
  860. )}
  861. </div>
  862. </>
  863. )}
  864. {dataSourceType === DataSourceType.WEB && (
  865. <>
  866. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.websiteSource')}</div>
  867. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  868. <Globe01 className='shrink-0 mr-1' />
  869. <span className='grow w-0 truncate'>{websitePages[0].source_url}</span>
  870. {websitePages.length > 1 && (
  871. <span className={s.sourceCount}>
  872. <span>{t('datasetCreation.stepTwo.other')}</span>
  873. <span>{websitePages.length - 1}</span>
  874. <span>{t('datasetCreation.stepTwo.webpageUnit')}</span>
  875. </span>
  876. )}
  877. </div>
  878. </>
  879. )}
  880. </div>
  881. <div className={s.divider} />
  882. <div className={s.segmentCount}>
  883. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.estimateSegment')}</div>
  884. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  885. {
  886. fileIndexingEstimate
  887. ? (
  888. <div className='text-xs font-medium text-gray-800'>{formatNumber(fileIndexingEstimate.total_segments)} </div>
  889. )
  890. : (
  891. <div className={s.calculating}>{t('datasetCreation.stepTwo.calculating')}</div>
  892. )
  893. }
  894. </div>
  895. </div>
  896. </div>
  897. {!isSetting
  898. ? (
  899. <div className='flex items-center mt-8 py-2'>
  900. <Button onClick={() => onStepChange && onStepChange(-1)}>{t('datasetCreation.stepTwo.previousStep')}</Button>
  901. <div className={s.divider} />
  902. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
  903. </div>
  904. )
  905. : (
  906. <div className='flex items-center mt-8 py-2'>
  907. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
  908. <Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
  909. </div>
  910. )}
  911. </div>
  912. </div>
  913. </div>
  914. <FloatRightContainer isMobile={isMobile} isOpen={showPreview} onClose={hidePreview} footer={null}>
  915. {showPreview && <div ref={previewScrollRef} className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll border-l border-[#F2F4F7]')}>
  916. <div className={cn(s.previewHeader, previewScrolled && `${s.fixed} pb-3`)}>
  917. <div className='flex items-center justify-between px-8'>
  918. <div className='grow flex items-center'>
  919. <div>{t('datasetCreation.stepTwo.previewTitle')}</div>
  920. {docForm === DocForm.QA && !previewSwitched && (
  921. <Button className='ml-2' variant='secondary-accent' onClick={() => previewSwitch()}>{t('datasetCreation.stepTwo.previewButton')}</Button>
  922. )}
  923. </div>
  924. <div className='flex items-center justify-center w-6 h-6 cursor-pointer' onClick={hidePreview}>
  925. <XMarkIcon className='h-4 w-4'></XMarkIcon>
  926. </div>
  927. </div>
  928. {docForm === DocForm.QA && !previewSwitched && (
  929. <div className='px-8 pr-12 text-xs text-gray-500'>
  930. <span>{t('datasetCreation.stepTwo.previewSwitchTipStart')}</span>
  931. <span className='text-amber-600'>{t('datasetCreation.stepTwo.previewSwitchTipEnd')}</span>
  932. </div>
  933. )}
  934. </div>
  935. <div className='my-4 px-8 space-y-4'>
  936. {previewSwitched && docForm === DocForm.QA && fileIndexingEstimate?.qa_preview && (
  937. <>
  938. {fileIndexingEstimate?.qa_preview.map((item, index) => (
  939. <PreviewItem type={PreviewType.QA} key={item.question} qa={item} index={index + 1} />
  940. ))}
  941. </>
  942. )}
  943. {(docForm === DocForm.TEXT || !previewSwitched) && fileIndexingEstimate?.preview && (
  944. <>
  945. {fileIndexingEstimate?.preview.map((item, index) => (
  946. <PreviewItem type={PreviewType.TEXT} key={item} content={item} index={index + 1} />
  947. ))}
  948. </>
  949. )}
  950. {previewSwitched && docForm === DocForm.QA && !fileIndexingEstimate?.qa_preview && (
  951. <div className='flex items-center justify-center h-[200px]'>
  952. <Loading type='area' />
  953. </div>
  954. )}
  955. {!previewSwitched && !fileIndexingEstimate?.preview && (
  956. <div className='flex items-center justify-center h-[200px]'>
  957. <Loading type='area' />
  958. </div>
  959. )}
  960. </div>
  961. </div>}
  962. {!showPreview && (
  963. <div className={cn(s.sideTip)}>
  964. <div className={s.tipCard}>
  965. <span className={s.icon} />
  966. <div className={s.title}>{t('datasetCreation.stepTwo.sideTipTitle')}</div>
  967. <div className={s.content}>
  968. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP1')}</p>
  969. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP2')}</p>
  970. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP3')}</p>
  971. <p>{t('datasetCreation.stepTwo.sideTipP4')}</p>
  972. </div>
  973. </div>
  974. </div>
  975. )}
  976. </FloatRightContainer>
  977. </div>
  978. )
  979. }
  // Make StepTwo this module's default export; StepTwo itself is declared earlier in this file.
  980. export default StepTwo