// use-metadata.ts
'use client'
import { useTranslation } from 'react-i18next'
import { formatFileSize, formatNumber, formatTime } from '@/utils/format'
import type { DocType } from '@/models/datasets'
import useTimestamp from '@/hooks/use-timestamp'
  6. export type inputType = 'input' | 'select' | 'textarea'
  7. export type metadataType = DocType | 'originInfo' | 'technicalParameters'
  8. type MetadataMap =
  9. Record<
  10. metadataType,
  11. {
  12. text: string
  13. allowEdit?: boolean
  14. icon?: React.ReactNode
  15. iconName?: string
  16. subFieldsMap: Record<
  17. string,
  18. {
  19. label: string
  20. inputType?: inputType
  21. field?: string
  22. render?: (value: any, total?: number) => React.ReactNode | string
  23. }
  24. >
  25. }
  26. >
  27. const fieldPrefix = 'datasetDocuments.metadata.field'
  28. export const useMetadataMap = (): MetadataMap => {
  29. const { t } = useTranslation()
  30. const { formatTime: formatTimestamp } = useTimestamp()
  31. return {
  32. book: {
  33. text: t('datasetDocuments.metadata.type.book'),
  34. iconName: 'bookOpen',
  35. subFieldsMap: {
  36. title: { label: t(`${fieldPrefix}.book.title`) },
  37. language: {
  38. label: t(`${fieldPrefix}.book.language`),
  39. inputType: 'select',
  40. },
  41. author: { label: t(`${fieldPrefix}.book.author`) },
  42. publisher: { label: t(`${fieldPrefix}.book.publisher`) },
  43. publication_date: { label: t(`${fieldPrefix}.book.publicationDate`) },
  44. isbn: { label: t(`${fieldPrefix}.book.ISBN`) },
  45. category: {
  46. label: t(`${fieldPrefix}.book.category`),
  47. inputType: 'select',
  48. },
  49. },
  50. },
  51. web_page: {
  52. text: t('datasetDocuments.metadata.type.webPage'),
  53. iconName: 'globe',
  54. subFieldsMap: {
  55. 'title': { label: t(`${fieldPrefix}.webPage.title`) },
  56. 'url': { label: t(`${fieldPrefix}.webPage.url`) },
  57. 'language': {
  58. label: t(`${fieldPrefix}.webPage.language`),
  59. inputType: 'select',
  60. },
  61. 'author/publisher': { label: t(`${fieldPrefix}.webPage.authorPublisher`) },
  62. 'publish_date': { label: t(`${fieldPrefix}.webPage.publishDate`) },
  63. 'topics/keywords': { label: t(`${fieldPrefix}.webPage.topicsKeywords`) },
  64. 'description': { label: t(`${fieldPrefix}.webPage.description`) },
  65. },
  66. },
  67. paper: {
  68. text: t('datasetDocuments.metadata.type.paper'),
  69. iconName: 'graduationHat',
  70. subFieldsMap: {
  71. 'title': { label: t(`${fieldPrefix}.paper.title`) },
  72. 'language': {
  73. label: t(`${fieldPrefix}.paper.language`),
  74. inputType: 'select',
  75. },
  76. 'author': { label: t(`${fieldPrefix}.paper.author`) },
  77. 'publish_date': { label: t(`${fieldPrefix}.paper.publishDate`) },
  78. 'journal/conference_name': {
  79. label: t(`${fieldPrefix}.paper.journalConferenceName`),
  80. },
  81. 'volume/issue/page_numbers': { label: t(`${fieldPrefix}.paper.volumeIssuePage`) },
  82. 'doi': { label: t(`${fieldPrefix}.paper.DOI`) },
  83. 'topics/keywords': { label: t(`${fieldPrefix}.paper.topicsKeywords`) },
  84. 'abstract': {
  85. label: t(`${fieldPrefix}.paper.abstract`),
  86. inputType: 'textarea',
  87. },
  88. },
  89. },
  90. social_media_post: {
  91. text: t('datasetDocuments.metadata.type.socialMediaPost'),
  92. iconName: 'atSign',
  93. subFieldsMap: {
  94. 'platform': { label: t(`${fieldPrefix}.socialMediaPost.platform`) },
  95. 'author/username': {
  96. label: t(`${fieldPrefix}.socialMediaPost.authorUsername`),
  97. },
  98. 'publish_date': { label: t(`${fieldPrefix}.socialMediaPost.publishDate`) },
  99. 'post_url': { label: t(`${fieldPrefix}.socialMediaPost.postURL`) },
  100. 'topics/tags': { label: t(`${fieldPrefix}.socialMediaPost.topicsTags`) },
  101. },
  102. },
  103. personal_document: {
  104. text: t('datasetDocuments.metadata.type.personalDocument'),
  105. iconName: 'file',
  106. subFieldsMap: {
  107. 'title': { label: t(`${fieldPrefix}.personalDocument.title`) },
  108. 'author': { label: t(`${fieldPrefix}.personalDocument.author`) },
  109. 'creation_date': {
  110. label: t(`${fieldPrefix}.personalDocument.creationDate`),
  111. },
  112. 'last_modified_date': {
  113. label: t(`${fieldPrefix}.personalDocument.lastModifiedDate`),
  114. },
  115. 'document_type': {
  116. label: t(`${fieldPrefix}.personalDocument.documentType`),
  117. inputType: 'select',
  118. },
  119. 'tags/category': {
  120. label: t(`${fieldPrefix}.personalDocument.tagsCategory`),
  121. },
  122. },
  123. },
  124. business_document: {
  125. text: t('datasetDocuments.metadata.type.businessDocument'),
  126. iconName: 'briefcase',
  127. subFieldsMap: {
  128. 'title': { label: t(`${fieldPrefix}.businessDocument.title`) },
  129. 'author': { label: t(`${fieldPrefix}.businessDocument.author`) },
  130. 'creation_date': {
  131. label: t(`${fieldPrefix}.businessDocument.creationDate`),
  132. },
  133. 'last_modified_date': {
  134. label: t(`${fieldPrefix}.businessDocument.lastModifiedDate`),
  135. },
  136. 'document_type': {
  137. label: t(`${fieldPrefix}.businessDocument.documentType`),
  138. inputType: 'select',
  139. },
  140. 'department/team': {
  141. label: t(`${fieldPrefix}.businessDocument.departmentTeam`),
  142. },
  143. },
  144. },
  145. im_chat_log: {
  146. text: t('datasetDocuments.metadata.type.IMChat'),
  147. iconName: 'messageTextCircle',
  148. subFieldsMap: {
  149. 'chat_platform': { label: t(`${fieldPrefix}.IMChat.chatPlatform`) },
  150. 'chat_participants/group_name': {
  151. label: t(`${fieldPrefix}.IMChat.chatPartiesGroupName`),
  152. },
  153. 'start_date': { label: t(`${fieldPrefix}.IMChat.startDate`) },
  154. 'end_date': { label: t(`${fieldPrefix}.IMChat.endDate`) },
  155. 'participants': { label: t(`${fieldPrefix}.IMChat.participants`) },
  156. 'topicsKeywords': {
  157. label: t(`${fieldPrefix}.IMChat.topicsKeywords`),
  158. inputType: 'textarea',
  159. },
  160. 'fileType': { label: t(`${fieldPrefix}.IMChat.fileType`) },
  161. },
  162. },
  163. wikipedia_entry: {
  164. text: t('datasetDocuments.metadata.type.wikipediaEntry'),
  165. allowEdit: false,
  166. subFieldsMap: {
  167. 'title': { label: t(`${fieldPrefix}.wikipediaEntry.title`) },
  168. 'language': {
  169. label: t(`${fieldPrefix}.wikipediaEntry.language`),
  170. inputType: 'select',
  171. },
  172. 'web_page_url': { label: t(`${fieldPrefix}.wikipediaEntry.webpageURL`) },
  173. 'editor/contributor': {
  174. label: t(`${fieldPrefix}.wikipediaEntry.editorContributor`),
  175. },
  176. 'last_edit_date': {
  177. label: t(`${fieldPrefix}.wikipediaEntry.lastEditDate`),
  178. },
  179. 'summary/introduction': {
  180. label: t(`${fieldPrefix}.wikipediaEntry.summaryIntroduction`),
  181. inputType: 'textarea',
  182. },
  183. },
  184. },
  185. synced_from_notion: {
  186. text: t('datasetDocuments.metadata.type.notion'),
  187. allowEdit: false,
  188. subFieldsMap: {
  189. 'title': { label: t(`${fieldPrefix}.notion.title`) },
  190. 'language': { label: t(`${fieldPrefix}.notion.lang`), inputType: 'select' },
  191. 'author/creator': { label: t(`${fieldPrefix}.notion.author`) },
  192. 'creation_date': { label: t(`${fieldPrefix}.notion.createdTime`) },
  193. 'last_modified_date': {
  194. label: t(`${fieldPrefix}.notion.lastModifiedTime`),
  195. },
  196. 'notion_page_link': { label: t(`${fieldPrefix}.notion.url`) },
  197. 'category/tags': { label: t(`${fieldPrefix}.notion.tag`) },
  198. 'description': { label: t(`${fieldPrefix}.notion.desc`) },
  199. },
  200. },
  201. synced_from_github: {
  202. text: t('datasetDocuments.metadata.type.github'),
  203. allowEdit: false,
  204. subFieldsMap: {
  205. 'repository_name': { label: t(`${fieldPrefix}.github.repoName`) },
  206. 'repository_description': { label: t(`${fieldPrefix}.github.repoDesc`) },
  207. 'repository_owner/organization': { label: t(`${fieldPrefix}.github.repoOwner`) },
  208. 'code_filename': { label: t(`${fieldPrefix}.github.fileName`) },
  209. 'code_file_path': { label: t(`${fieldPrefix}.github.filePath`) },
  210. 'programming_language': { label: t(`${fieldPrefix}.github.programmingLang`) },
  211. 'github_link': { label: t(`${fieldPrefix}.github.url`) },
  212. 'open_source_license': { label: t(`${fieldPrefix}.github.license`) },
  213. 'commit_date': { label: t(`${fieldPrefix}.github.lastCommitTime`) },
  214. 'commit_author': {
  215. label: t(`${fieldPrefix}.github.lastCommitAuthor`),
  216. },
  217. },
  218. },
  219. originInfo: {
  220. text: '',
  221. allowEdit: false,
  222. subFieldsMap: {
  223. 'name': { label: t(`${fieldPrefix}.originInfo.originalFilename`) },
  224. 'data_source_info.upload_file.size': {
  225. label: t(`${fieldPrefix}.originInfo.originalFileSize`),
  226. render: value => formatFileSize(value),
  227. },
  228. 'created_at': {
  229. label: t(`${fieldPrefix}.originInfo.uploadDate`),
  230. render: value => formatTimestamp(value, t('datasetDocuments.metadata.dateTimeFormat') as string),
  231. },
  232. 'completed_at': {
  233. label: t(`${fieldPrefix}.originInfo.lastUpdateDate`),
  234. render: value => formatTimestamp(value, t('datasetDocuments.metadata.dateTimeFormat') as string),
  235. },
  236. 'data_source_type': {
  237. label: t(`${fieldPrefix}.originInfo.source`),
  238. render: value => t(`datasetDocuments.metadata.source.${value}`),
  239. },
  240. },
  241. },
  242. technicalParameters: {
  243. text: t('datasetDocuments.metadata.type.technicalParameters'),
  244. allowEdit: false,
  245. subFieldsMap: {
  246. 'dataset_process_rule.mode': {
  247. label: t(`${fieldPrefix}.technicalParameters.segmentSpecification`),
  248. render: value => value === 'automatic' ? (t('datasetDocuments.embedding.automatic') as string) : (t('datasetDocuments.embedding.custom') as string),
  249. },
  250. 'dataset_process_rule.rules.segmentation.max_tokens': {
  251. label: t(`${fieldPrefix}.technicalParameters.segmentLength`),
  252. render: value => formatNumber(value),
  253. },
  254. 'average_segment_length': {
  255. label: t(`${fieldPrefix}.technicalParameters.avgParagraphLength`),
  256. render: value => `${formatNumber(value)} characters`,
  257. },
  258. 'segment_count': {
  259. label: t(`${fieldPrefix}.technicalParameters.paragraphs`),
  260. render: value => `${formatNumber(value)} paragraphs`,
  261. },
  262. 'hit_count': {
  263. label: t(`${fieldPrefix}.technicalParameters.hitCount`),
  264. render: (value, total) => {
  265. const v = value || 0
  266. return `${!total ? 0 : ((v / total) * 100).toFixed(2)}% (${v}/${total})`
  267. },
  268. },
  269. 'indexing_latency': {
  270. label: t(`${fieldPrefix}.technicalParameters.embeddingTime`),
  271. render: value => formatTime(value),
  272. },
  273. 'tokens': {
  274. label: t(`${fieldPrefix}.technicalParameters.embeddedSpend`),
  275. render: value => `${formatNumber(value)} tokens`,
  276. },
  277. },
  278. },
  279. }
  280. }
  281. const langPrefix = 'datasetDocuments.metadata.languageMap.'
  282. export const useLanguages = () => {
  283. const { t } = useTranslation()
  284. return {
  285. zh: t(`${langPrefix}zh`),
  286. en: t(`${langPrefix}en`),
  287. es: t(`${langPrefix}es`),
  288. fr: t(`${langPrefix}fr`),
  289. de: t(`${langPrefix}de`),
  290. ja: t(`${langPrefix}ja`),
  291. ko: t(`${langPrefix}ko`),
  292. ru: t(`${langPrefix}ru`),
  293. ar: t(`${langPrefix}ar`),
  294. pt: t(`${langPrefix}pt`),
  295. it: t(`${langPrefix}it`),
  296. nl: t(`${langPrefix}nl`),
  297. pl: t(`${langPrefix}pl`),
  298. sv: t(`${langPrefix}sv`),
  299. tr: t(`${langPrefix}tr`),
  300. he: t(`${langPrefix}he`),
  301. hi: t(`${langPrefix}hi`),
  302. da: t(`${langPrefix}da`),
  303. fi: t(`${langPrefix}fi`),
  304. no: t(`${langPrefix}no`),
  305. hu: t(`${langPrefix}hu`),
  306. el: t(`${langPrefix}el`),
  307. cs: t(`${langPrefix}cs`),
  308. th: t(`${langPrefix}th`),
  309. id: t(`${langPrefix}id`),
  310. ro: t(`${langPrefix}ro`),
  311. }
  312. }
  313. const bookCategoryPrefix = 'datasetDocuments.metadata.categoryMap.book.'
  314. export const useBookCategories = () => {
  315. const { t } = useTranslation()
  316. return {
  317. fiction: t(`${bookCategoryPrefix}fiction`),
  318. biography: t(`${bookCategoryPrefix}biography`),
  319. history: t(`${bookCategoryPrefix}history`),
  320. science: t(`${bookCategoryPrefix}science`),
  321. technology: t(`${bookCategoryPrefix}technology`),
  322. education: t(`${bookCategoryPrefix}education`),
  323. philosophy: t(`${bookCategoryPrefix}philosophy`),
  324. religion: t(`${bookCategoryPrefix}religion`),
  325. socialSciences: t(`${bookCategoryPrefix}socialSciences`),
  326. art: t(`${bookCategoryPrefix}art`),
  327. travel: t(`${bookCategoryPrefix}travel`),
  328. health: t(`${bookCategoryPrefix}health`),
  329. selfHelp: t(`${bookCategoryPrefix}selfHelp`),
  330. businessEconomics: t(`${bookCategoryPrefix}businessEconomics`),
  331. cooking: t(`${bookCategoryPrefix}cooking`),
  332. childrenYoungAdults: t(`${bookCategoryPrefix}childrenYoungAdults`),
  333. comicsGraphicNovels: t(`${bookCategoryPrefix}comicsGraphicNovels`),
  334. poetry: t(`${bookCategoryPrefix}poetry`),
  335. drama: t(`${bookCategoryPrefix}drama`),
  336. other: t(`${bookCategoryPrefix}other`),
  337. }
  338. }
  339. const personalDocCategoryPrefix
  340. = 'datasetDocuments.metadata.categoryMap.personalDoc.'
  341. export const usePersonalDocCategories = () => {
  342. const { t } = useTranslation()
  343. return {
  344. notes: t(`${personalDocCategoryPrefix}notes`),
  345. blogDraft: t(`${personalDocCategoryPrefix}blogDraft`),
  346. diary: t(`${personalDocCategoryPrefix}diary`),
  347. researchReport: t(`${personalDocCategoryPrefix}researchReport`),
  348. bookExcerpt: t(`${personalDocCategoryPrefix}bookExcerpt`),
  349. schedule: t(`${personalDocCategoryPrefix}schedule`),
  350. list: t(`${personalDocCategoryPrefix}list`),
  351. projectOverview: t(`${personalDocCategoryPrefix}projectOverview`),
  352. photoCollection: t(`${personalDocCategoryPrefix}photoCollection`),
  353. creativeWriting: t(`${personalDocCategoryPrefix}creativeWriting`),
  354. codeSnippet: t(`${personalDocCategoryPrefix}codeSnippet`),
  355. designDraft: t(`${personalDocCategoryPrefix}designDraft`),
  356. personalResume: t(`${personalDocCategoryPrefix}personalResume`),
  357. other: t(`${personalDocCategoryPrefix}other`),
  358. }
  359. }
  360. const businessDocCategoryPrefix
  361. = 'datasetDocuments.metadata.categoryMap.businessDoc.'
  362. export const useBusinessDocCategories = () => {
  363. const { t } = useTranslation()
  364. return {
  365. meetingMinutes: t(`${businessDocCategoryPrefix}meetingMinutes`),
  366. researchReport: t(`${businessDocCategoryPrefix}researchReport`),
  367. proposal: t(`${businessDocCategoryPrefix}proposal`),
  368. employeeHandbook: t(`${businessDocCategoryPrefix}employeeHandbook`),
  369. trainingMaterials: t(`${businessDocCategoryPrefix}trainingMaterials`),
  370. requirementsDocument: t(`${businessDocCategoryPrefix}requirementsDocument`),
  371. designDocument: t(`${businessDocCategoryPrefix}designDocument`),
  372. productSpecification: t(`${businessDocCategoryPrefix}productSpecification`),
  373. financialReport: t(`${businessDocCategoryPrefix}financialReport`),
  374. marketAnalysis: t(`${businessDocCategoryPrefix}marketAnalysis`),
  375. projectPlan: t(`${businessDocCategoryPrefix}projectPlan`),
  376. teamStructure: t(`${businessDocCategoryPrefix}teamStructure`),
  377. policiesProcedures: t(`${businessDocCategoryPrefix}policiesProcedures`),
  378. contractsAgreements: t(`${businessDocCategoryPrefix}contractsAgreements`),
  379. emailCorrespondence: t(`${businessDocCategoryPrefix}emailCorrespondence`),
  380. other: t(`${businessDocCategoryPrefix}other`),
  381. }
  382. }