datasets.ts 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
  1. import type { DataSourceNotionPage, DataSourceProvider } from './common'
  2. import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
  3. import type { Tag } from '@/app/components/base/tag-management/constant'
  /**
   * String enum of the data source kinds a dataset or document can declare:
   * an uploaded file, a Notion import, or a website crawl.
   */
  export enum DataSourceType {
    FILE = 'upload_file',
    NOTION = 'notion_import',
    WEB = 'website_crawl',
  }
  /** Allowed values of a dataset's `permission` field. */
  export type DatasetPermission =
    | 'only_me'
    | 'all_team_members'
    | 'partial_members'
  /**
   * Shape of a single dataset record.
   *
   * NOTE(review): `retrieval_model_dict` and `retrieval_model` carry the same
   * type; which of the two a given endpoint fills in is not visible here.
   */
  export type DataSet = {
    // Identity and presentation
    id: string
    name: string
    icon: string
    icon_background: string
    description: string

    // Access and source
    permission: DatasetPermission
    data_source_type: DataSourceType
    indexing_technique: 'high_quality' | 'economy'

    // Audit fields
    created_by: string
    updated_by: string
    updated_at: number

    // Counters
    app_count: number
    document_count: number
    word_count: number

    // Provider and embedding configuration
    provider: string
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean

    // Retrieval configuration
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig

    tags: Array<Tag>
    // Element shape is untyped in this file.
    partial_member_list?: Array<any>

    // External knowledge binding
    external_knowledge_info: {
      external_knowledge_id: string
      external_knowledge_api_id: string
      external_knowledge_api_name: string
      external_knowledge_api_endpoint: string
    }
    external_retrieval_model: {
      top_k: number
      score_threshold: number
      score_threshold_enabled: boolean
    }
  }
  /**
   * An external API entry: its identity, connection settings and the datasets
   * bound to it.
   */
  export type ExternalAPIItem = {
    id: string
    tenant_id: string
    name: string
    description: string
    /** Connection settings (endpoint and API key). */
    settings: {
      endpoint: string
      api_key: string
    }
    /** Datasets bound to this API, as `{ id, name }` pairs. */
    dataset_bindings: Array<{ id: string; name: string }>
    created_by: string
    created_at: string
  }
  /**
   * A knowledge item whose `provider` is always `'external'`.
   * `data_source_type` and `indexing_technique` are typed as `null` for it.
   */
  export type ExternalKnowledgeItem = {
    id: string
    name: string
    description: string | null
    provider: 'external'
    permission: DatasetPermission

    // Always null for this shape.
    data_source_type: null
    indexing_technique: null

    // Counters
    app_count: number
    document_count: number
    word_count: number

    // Audit fields (timestamps are strings here)
    created_by: string
    created_at: string
    updated_by: string
    updated_at: string

    tags: Array<Tag>
  }
  /** Response body whose `result` field is either `'success'` or `'error'`. */
  export type ExternalAPIDeleteResponse = {
    result:
      | 'success'
      | 'error'
  }
  /** Usage summary: an in-use flag plus a count. */
  export type ExternalAPIUsage = {
    is_using: boolean
    count: number
  }
  /**
   * A `File` extended with optional metadata fields.
   * All added fields are optional, so a plain `File` is assignable to this type.
   */
  export type CustomFile = File & {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  }
  /**
   * Options for a website crawl.
   * `limit` and `max_depth` accept either a number or a string.
   */
  export type CrawlOptions = {
    crawl_sub_pages: boolean
    only_main_content: boolean
    includes: string
    excludes: string
    limit: string | number
    max_depth: string | number
    use_sitemap: boolean
  }
  /** One crawl result entry: title, markdown body, description and source URL. */
  export type CrawlResultItem = {
    title: string
    markdown: string
    description: string
    source_url: string
  }
  /**
   * Pairs a `CustomFile` with an identifier and a numeric progress value.
   * NOTE(review): the scale of `progress` is not defined in this file — confirm.
   */
  export type FileItem = {
    fileID: string
    file: CustomFile
    progress: number
  }
  /** Paginated list of `DataSet` records. */
  export type DataSetListResponse = {
    data: Array<DataSet>
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** Paginated list of `ExternalAPIItem` records. */
  export type ExternalAPIListResponse = {
    data: Array<ExternalAPIItem>
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** A question/answer pair. */
  export type QA = {
    question: string
    answer: string
  }
  /**
   * Indexing estimate: token count, price and currency, segment total, and
   * preview content. `qa_preview` is optional.
   */
  export type IndexingEstimateResponse = {
    tokens: number
    total_price: number
    currency: string
    total_segments: number
    preview: Array<string>
    qa_preview?: Array<QA>
  }
  /** `IndexingEstimateResponse` plus a `total_nodes` count. */
  export type FileIndexingEstimateResponse = IndexingEstimateResponse & {
    total_nodes: number
  }
  /**
   * Indexing status of a single item, with per-stage timestamps and
   * segment progress counters.
   *
   * NOTE(review): `completed_at`, `paused_at`, `error` and `stopped_at` are
   * typed `any`; their real shapes are not established in this file.
   */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus

    // Stage timestamps
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number

    // Loosely typed terminal / interruption fields
    completed_at: any
    paused_at: any
    error: any
    stopped_at: any

    // Progress counters
    completed_segments: number
    total_segments: number
  }
  /** Wrapper holding a list of `IndexingStatusResponse` entries under `data`. */
  export type IndexingStatusBatchResponse = {
    data: Array<IndexingStatusResponse>
  }
  /** Processing mode: either `'automatic'` or `'custom'`. */
  export type ProcessMode =
    | 'automatic'
    | 'custom'
  /** Process rule payload: the mode, the rule set and the applicable limits. */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
    limits: Limits
  }
  /** Rule set: a list of pre-processing rules plus segmentation settings. */
  export type Rules = {
    pre_processing_rules: Array<PreProcessingRule>
    segmentation: Segmentation
  }
  /** Limits payload; currently a single numeric segmentation-token-length cap. */
  export type Limits = {
    indexing_max_segmentation_tokens_length: number
  }
  /** A pre-processing rule identified by `id`, with an on/off flag. */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /** Segmentation settings: separator string, max tokens and chunk overlap. */
  export type Segmentation = {
    separator: string
    max_tokens: number
    chunk_overlap: number
  }
  /**
   * Readonly tuple of every document indexing status value.
   * Declared `as const` so the literal types survive for the derived union.
   */
  export const DocumentIndexingStatusList = [
    'waiting', 'parsing', 'cleaning', 'splitting',
    'indexing', 'paused', 'error', 'completed',
  ] as const

  /** Union of the literals in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = (typeof DocumentIndexingStatusList)[number]
  /**
   * Readonly tuple of every document display status value.
   * Declared `as const` so the literal types survive for the derived union.
   */
  export const DisplayStatusList = [
    'queuing', 'indexing', 'paused', 'error',
    'available', 'enabled', 'disabled', 'archived',
  ] as const

  /** Union of the literals in `DisplayStatusList`. */
  export type DocumentDisplayStatus = (typeof DisplayStatusList)[number]
  /**
   * Source-specific details attached to a document.
   *
   * NOTE(review): `upload_file`, `job_id` and `url` are all required here,
   * while the Notion fields and `provider` are optional — confirm that every
   * data source type really supplies the required ones.
   */
  export type DataSourceInfo = {
    /** Metadata of the uploaded file. */
    upload_file: {
      id: string
      name: string
      size: number
      mime_type: string
      created_at: number
      created_by: string
      extension: string
    }

    // Notion-related fields
    notion_page_icon?: string
    notion_workspace_id?: string
    notion_page_id?: string

    provider?: DataSourceProvider
    job_id: string
    url: string
  }
  /**
   * Base document record. `SimpleDocumentDetail` is built by intersecting
   * this type with additional fields.
   */
  export type InitialDocumentDetail = {
    id: string
    batch: string
    position: number
    dataset_id: string

    // Source
    data_source_type: DataSourceType
    data_source_info: DataSourceInfo
    dataset_process_rule_id: string

    name: string
    created_from:
      | 'api'
      | 'web'
    created_by: string
    created_at: number

    // Status
    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number

    // Document form and language
    doc_form:
      | 'text_model'
      | 'qa_model'
    doc_language: string
  }
  /**
   * `InitialDocumentDetail` extended with enablement, archive state and
   * counters.
   *
   * `dataset_process_rule_id` is declared optional here, but the intersection
   * with `InitialDocumentDetail` (where it is required) keeps it required.
   */
  export type SimpleDocumentDetail = InitialDocumentDetail & {
    enabled: boolean
    word_count: number
    error?: string | null
    archived: boolean
    updated_at: number
    hit_count: number
    dataset_process_rule_id?: string
    /** Optional name and extension of the uploaded file. */
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
  }
  /** Paginated list of `SimpleDocumentDetail` records. */
  export type DocumentListResponse = {
    data: Array<SimpleDocumentDetail>
    has_more: boolean
    total: number
    page: number
    limit: number
  }
  /**
   * Fields shared by document requests; `CreateDocumentReq` and
   * `IndexingEstimateParams` both intersect with this type.
   */
  export type DocumentReq = {
    original_document_id?: string
    indexing_technique?: string
    doc_form:
      | 'text_model'
      | 'qa_model'
    doc_language: string
    process_rule: ProcessRule
  }
  /**
   * `DocumentReq` plus the data source, retrieval configuration and
   * embedding model selection.
   */
  export type CreateDocumentReq = DocumentReq & {
    data_source: DataSource
    retrieval_model: RetrievalConfig
    embedding_model: string
    embedding_model_provider: string
  }
  /**
   * `DocumentReq` combined with an all-optional `DataSource` and a required
   * `dataset_id`.
   */
  export type IndexingEstimateParams =
    & DocumentReq
    & Partial<DataSource>
    & { dataset_id: string }
  /**
   * Describes where documents come from. `info_list` repeats the source type
   * and carries one optional sub-list per source kind (Notion, file, website).
   */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      notion_info_list?: Array<NotionInfo>
      file_info_list?: {
        file_ids: Array<string>
      }
      website_info_list?: {
        provider: string
        job_id: string
        urls: Array<string>
      }
    }
  }
  /** A Notion workspace id together with a list of its pages. */
  export type NotionInfo = {
    workspace_id: string
    pages: Array<DataSourceNotionPage>
  }
  /** Minimal Notion page reference: a page id and a type string. */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /**
   * Process rule as sent in requests: a mode string plus the rule set.
   * Unlike `ProcessRuleResponse`, `mode` is a plain `string` here.
   */
  export type ProcessRule = {
    mode: string
    rules: Rules
  }
  /**
   * Result of creating documents: the batch id, the created documents, and
   * optionally a dataset.
   * The lower-case type name is kept as-is because it is the exported name.
   */
  export type createDocumentResponse = {
    dataset?: DataSet
    batch: string
    documents: Array<InitialDocumentDetail>
  }
  /**
   * `SimpleDocumentDetail` extended with processing timestamps, pause /
   * disable / archive bookkeeping and document metadata.
   *
   * The trailing index signature lets any additional string key through as
   * `any`, so property access on this type is only loosely checked.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    batch: string
    created_api_request_id: string

    // Stage timestamps
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number

    tokens: number
    indexing_latency: number
    completed_at: number

    // Pause / stop bookkeeping
    paused_by: string
    paused_at: number
    stopped_at: number
    indexing_status: string

    // Disable bookkeeping
    disabled_at: number
    disabled_by: string

    // Archive bookkeeping
    archived_reason:
      | 'rule_modified'
      | 're_upload'
    archived_by: string
    archived_at: number

    // Document metadata
    doc_type?: DocType | null | 'others'
    doc_metadata?: DocMetadata | null
    segment_count: number

    [key: string]: any
  }
  /**
   * Document metadata as string fields. The index signature allows further
   * string-valued keys beyond the named ones.
   */
  export type DocMetadata = {
    title: string
    language: string
    author: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    [key: string]: string
  }
  /** Readonly tuple of the document types listed as customizable. */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book', 'web_page', 'paper', 'social_media_post',
    'personal_document', 'business_document', 'im_chat_log',
  ] as const

  /** Readonly tuple of the document types listed as fixed. */
  export const FIXED_DOC_TYPES = [
    'synced_from_github',
    'synced_from_notion',
    'wikipedia_entry',
  ] as const

  /** Union of the literals in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = (typeof CUSTOMIZABLE_DOC_TYPES)[number]

  /** Union of the literals in `FIXED_DOC_TYPES`. */
  export type FixedDocType = (typeof FIXED_DOC_TYPES)[number]

  /** Any document type, customizable or fixed. */
  export type DocType =
    | CustomizableDocType
    | FixedDocType
  /** Alias of `FullDocumentDetail`. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * Readonly tuple of every segment status value.
   *
   * Declared `as const` so the element types stay string literals. Without it
   * the array is inferred as `string[]` and the derived `SegmentStatus` below
   * collapses to plain `string`, which would accept any value.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const

  /** Union of the literals in `SEGMENT_STATUS_LIST`. */
  export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
  /**
   * Query parameters for listing segments. Only `limit` is required.
   * A `status` filter is present in the source but commented out.
   */
  export type SegmentsQuery = {
    last_id?: string
    limit: number
    // status?: SegmentStatus
    hit_count_gte?: number
    keyword?: string
    enabled?: boolean
  }
  /**
   * Full record of a single segment: content, index node references, status
   * and timestamps. `answer` is optional.
   */
  export type SegmentDetailModel = {
    id: string
    position: number
    document_id: string

    // Content
    content: string
    word_count: number
    tokens: number
    keywords: Array<string>

    // Index node references
    index_node_id: string
    index_node_hash: string

    hit_count: number

    // Enablement
    enabled: boolean
    disabled_at: number
    disabled_by: string

    status: SegmentStatus

    // Audit and processing timestamps
    created_by: string
    created_at: number
    indexing_at: number
    completed_at: number
    error: string | null
    stopped_at: number

    answer?: string
  }
  /** List of `SegmentDetailModel` records with paging fields (no `page` field). */
  export type SegmentsResponse = {
    data: Array<SegmentDetailModel>
    has_more: boolean
    limit: number
    total: number
  }
  /**
   * One hit-testing record: its content, the source it came from and who
   * created it.
   */
  export type HitTestingRecord = {
    id: string
    content: string
    source:
      | 'app'
      | 'hit_testing'
      | 'plugin'
    source_app_id: string
    created_by_role:
      | 'account'
      | 'end_user'
    created_by: string
    created_at: number
  }
  /** One hit-testing result: the matched segment, its score and a 2-D position. */
  export type HitTesting = {
    segment: Segment
    score: number
    tsne_position: TsnePosition
  }
  /**
   * One hit-testing result from an external knowledge base.
   * `metadata` carries two `x-amz-bedrock-kb-*` string entries.
   */
  export type ExternalKnowledgeBaseHitTesting = {
    content: string
    title: string
    score: number
    metadata: {
      'x-amz-bedrock-kb-source-uri': string
      'x-amz-bedrock-kb-data-source-id': string
    }
  }
  /** A segment as embedded in `HitTesting`, including its parent document. */
  export type Segment = {
    id: string
    document: Document
    content: string
    position: number
    word_count: number
    tokens: number
    keywords: Array<string>
    hit_count: number
    index_node_hash: string
  }
  /**
   * Minimal document summary referenced from `Segment`.
   * Within this module the name shadows the global DOM `Document` type.
   */
  export type Document = {
    id: string
    data_source_type: string
    name: string
    doc_type: DocType
  }
  /** Paginated list of `HitTestingRecord` entries. */
  export type HitTestingRecordsResponse = {
    data: Array<HitTestingRecord>
    has_more: boolean
    limit: number
    total: number
    page: number
  }
  /** A 2-D coordinate pair. */
  export type TsnePosition = {
    x: number
    y: number
  }
  /** Hit-testing response: the query (with its position) and the matching records. */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: HitTesting[]
  }
  /** Hit-testing response for an external knowledge base: the query and its records. */
  export type ExternalKnowledgeBaseHitTestingResponse = {
    query: {
      content: string
    }
    records: ExternalKnowledgeBaseHitTesting[]
  }
  /**
   * Summary of a related app: its mode and icon details.
   * `icon_type` may be `null`.
   */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode
    icon_type: AppIconType | null
    icon: string
    icon_background: string
    icon_url: string
  }
  /** List of `RelatedApp` entries with a total count. */
  export type RelatedAppResponse = {
    data: RelatedApp[]
    total: number
  }
  /** Segment update payload: required `content`, optional `answer` and `keywords`. */
  export type SegmentUpdater = {
    content: string
    answer?: string
    keywords?: Array<string>
  }
  /**
   * String enum of document forms. Its values match the `'text_model'` and
   * `'qa_model'` literals used by the `doc_form` fields in this file.
   */
  export enum DocForm {
    TEXT = 'text_model',
    QA = 'qa_model',
  }
  /** List of `IndexingStatusResponse` entries with a total count. */
  export type ErrorDocsResponse = {
    data: Array<IndexingStatusResponse>
    total: number
  }
  /**
   * Boolean flags summarising a selection of datasets, by indexing quality,
   * by internal/external origin, and by embedding-model consistency.
   */
  export type SelectedDatasetsMode = {
    // Indexing quality
    allHighQuality: boolean
    allHighQualityVectorSearch: boolean
    allHighQualityFullTextSearch: boolean
    allEconomic: boolean
    mixtureHighQualityAndEconomic: boolean

    // Internal vs. external
    allInternal: boolean
    allExternal: boolean
    mixtureInternalAndExternal: boolean

    inconsistentEmbeddingModel: boolean
  }
  /** String enum of weighted-score presets: semantic-first, keyword-first, customized. */
  export enum WeightedScoreEnum {
    SemanticFirst = 'semantic_first',
    KeywordFirst = 'keyword_first',
    Customized = 'customized',
  }
  /** String enum of reranking modes: a reranking model or a weighted score. */
  export enum RerankingModeEnum {
    RerankingModel = 'reranking_model',
    WeightedScore = 'weighted_score',
  }
  /**
   * Default semantic/keyword weight pairs, keyed by dataset selection mode.
   * Two keys mirror `SelectedDatasetsMode` flag names; `other` is the
   * remaining entry. Each pair sums to 1.
   */
  export const DEFAULT_WEIGHTED_SCORE = {
    // Semantic weight only
    allHighQualityVectorSearch: { semantic: 1, keyword: 0 },
    // Keyword weight only
    allHighQualityFullTextSearch: { semantic: 0, keyword: 1 },
    // Blend of both
    other: { semantic: 0.7, keyword: 0.3 },
  }