diff --git a/web/app/components/datasets/create/website/base/header.tsx b/web/app/components/datasets/create/website/base/header.tsx index dc6191d78f..6400e9475a 100644 --- a/web/app/components/datasets/create/website/base/header.tsx +++ b/web/app/components/datasets/create/website/base/header.tsx @@ -23,7 +23,7 @@ const Header = ({ }: HeaderProps) => { return (
-
+
- - {docTitle} + + {docTitle}
) diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/options.tsx b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options.tsx similarity index 98% rename from web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/options.tsx rename to web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options.tsx index 2fb90ffbe8..f1d24d4838 100644 --- a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/options.tsx +++ b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options.tsx @@ -9,8 +9,8 @@ import { RiPlayLargeLine } from '@remixicon/react' import { useBoolean } from 'ahooks' import { useEffect } from 'react' import { useTranslation } from 'react-i18next' -import { useSchema } from './hooks' import Toast from '@/app/components/base/toast' +import type { ZodSchema } from 'zod' const I18N_PREFIX = 'datasetCreation.stepOne.website' @@ -23,6 +23,7 @@ type OptionsProps = { configurations: BaseConfiguration[] isRunning: boolean controlFoldOptions?: number + schema: ZodSchema onSubmit: (data: FormData) => void } @@ -31,9 +32,9 @@ const Options = ({ configurations, isRunning, controlFoldOptions, + schema, onSubmit, }: OptionsProps) => { - const schema = useSchema() const form = useAppForm({ defaultValues: initialData, validators: { diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/hooks.ts b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/hooks.ts index db7562951e..777c568356 100644 --- a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/hooks.ts +++ b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/hooks.ts @@ -1,7 +1,7 @@ import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types' import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types' import { useTranslation } from 'react-i18next' -import type { FormData } from './options' +import type { FormData } from '../base/options' import { z } from 'zod' const ERROR_I18N_PREFIX = 'common.errorMsg' @@ -22,7 +22,7 @@ export const useConfigurations = () => { type: BaseFieldType.numberInput, variable: 'limit', label: t(`${I18N_PREFIX}.limit`), - required: false, + required: true, showConditions: [], }, { @@ -31,6 +31,7 @@ export const useConfigurations = () => { label: t(`${I18N_PREFIX}.maxDepth`), required: false, showConditions: [], + tooltip: t(`${I18N_PREFIX}.maxDepthTooltip`), }, { type: BaseFieldType.textInput, diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/index.tsx b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/index.tsx index b67a247183..9ec42da91a 100644 --- a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/index.tsx +++ b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/index.tsx @@ -1,5 +1,4 @@ 'use client' -import type { FC } from 'react' import React, { useCallback, useEffect, useState } from 'react' import { useTranslation } from 'react-i18next' import { useModalContextSelector } from '@/context/modal-context' @@ -7,16 +6,16 @@ import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' import { checkFirecrawlTaskStatus, createFirecrawlTask } from '@/service/datasets' import { sleep } from '@/utils' import Header from '@/app/components/datasets/create/website/base/header' -import type { FormData } from './options' -import Options from './options' -import { useConfigurations } from './hooks' +import type { FormData } from '../base/options' +import Options from '../base/options' +import { useConfigurations, useSchema } from './hooks' import Crawling from '../base/crawling' import ErrorMessage from '../base/error-message' import CrawledResult from '../base/crawled-result' const I18N_PREFIX = 'datasetCreation.stepOne.website' -type Props = { +type FireCrawlProps = { checkedCrawlResult: CrawlResultItem[] onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void onJobIdChange: (jobId: string) => void @@ -30,17 +29,19 @@ enum Step { finished = 'finished', } -const FireCrawl: FC = ({ +const FireCrawl = ({ checkedCrawlResult, onCheckedCrawlResultChange, onJobIdChange, crawlOptions, onCrawlOptionsChange, -}) => { +}: FireCrawlProps) => { const { t } = useTranslation() const [step, setStep] = useState(Step.init) const [controlFoldOptions, setControlFoldOptions] = useState(0) const configurations = useConfigurations() + const schema = useSchema() + useEffect(() => { if (step !== Step.init) setControlFoldOptions(Date.now()) @@ -163,6 +164,7 @@ const FireCrawl: FC = ({ configurations={configurations} isRunning={isRunning} controlFoldOptions={controlFoldOptions} + schema={schema} onSubmit={(value) => { handleRun(value) console.log('submit') diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/hooks.ts b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/hooks.ts new file mode 100644 index 0000000000..f1b2562427 --- /dev/null +++ b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/hooks.ts @@ -0,0 +1,67 @@ +import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types' +import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types' +import { useTranslation } from 'react-i18next' +import type { FormData } from '../base/options' +import { z } from 'zod' + +const ERROR_I18N_PREFIX = 'common.errorMsg' +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +export const useConfigurations = () => { + const { t } = useTranslation() + const configurations: BaseConfiguration[] = [ + { + type: BaseFieldType.textInput, + variable: 'url', + label: 'URL', + required: true, + showConditions: [], + placeholder: 'https://docs.dify.ai', + }, + { + type: BaseFieldType.numberInput, + variable: 'limit', + label: t(`${I18N_PREFIX}.limit`), + required: true, + showConditions: [], + }, + { + type: BaseFieldType.checkbox, + variable: 'crawl_sub_pages', + label: t(`${I18N_PREFIX}.crawlSubPage`), + required: false, + showConditions: [], + }, + { + type: BaseFieldType.checkbox, + variable: 'use_sitemap', + label: t(`${I18N_PREFIX}.useSitemap`), + tooltip: t(`${I18N_PREFIX}.useSitemapTooltip`), + required: false, + showConditions: [], + }, + ] + + return configurations +} + +export const useSchema = () => { + const { t } = useTranslation() + + const Schema = z.object({ + url: z.string().nonempty({ + message: t(`${ERROR_I18N_PREFIX}.fieldRequired`, { + field: 'url', + }), + }).regex(/^https?:\/\//, { + message: t(`${ERROR_I18N_PREFIX}.urlError`), + }), + limit: z.number().positive({ + message: t(`${ERROR_I18N_PREFIX}.fieldRequired`, { + field: t(`${I18N_PREFIX}.limit`), + }), + }).int(), + }).passthrough() + + return Schema +} diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/index.tsx b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/index.tsx new file mode 100644 index 0000000000..d92fb1f39f --- /dev/null +++ b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/index.tsx @@ -0,0 +1,216 @@ +'use client' +import React, { useCallback, useEffect, useState } from 'react' +import { useTranslation } from 'react-i18next' +import CrawledResult from '../base/crawled-result' +import Crawling from '../base/crawling' +import ErrorMessage from '../base/error-message' +import { useModalContextSelector } from '@/context/modal-context' +import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets' +import { sleep } from '@/utils' +import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' +import Header from '@/app/components/datasets/create/website/base/header' +import type { FormData } from '../base/options' +import Options from '../base/options' +import { useConfigurations, useSchema } from './hooks' + +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type JinaReaderProps = { + checkedCrawlResult: CrawlResultItem[] + onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void + onJobIdChange: (jobId: string) => void + crawlOptions: CrawlOptions + onCrawlOptionsChange: (payload: CrawlOptions) => void +} + +enum Step { + init = 'init', + running = 'running', + finished = 'finished', +} + +const JinaReader = ({ + checkedCrawlResult, + onCheckedCrawlResultChange, + onJobIdChange, + crawlOptions, + onCrawlOptionsChange, +}: JinaReaderProps) => { + const { t } = useTranslation() + const [step, setStep] = useState(Step.init) + const [controlFoldOptions, setControlFoldOptions] = useState(0) + const configurations = useConfigurations() + const schema = useSchema() + + useEffect(() => { + if (step !== Step.init) + setControlFoldOptions(Date.now()) + }, [step]) + + const setShowAccountSettingModal = useModalContextSelector(state => state.setShowAccountSettingModal) + const handleSetting = useCallback(() => { + setShowAccountSettingModal({ + payload: 'data-source', + }) + }, [setShowAccountSettingModal]) + + const isInit = step === Step.init + const isCrawlFinished = step === Step.finished + const isRunning = step === Step.running + const [crawlResult, setCrawlResult] = useState<{ + current: number + total: number + data: CrawlResultItem[] + time_consuming: number | string + } | undefined>(undefined) + const [crawlErrorMessage, setCrawlErrorMessage] = useState('') + const showError = isCrawlFinished && crawlErrorMessage + + const waitForCrawlFinished = useCallback(async (jobId: string) => { + try { + const res = await checkJinaReaderTaskStatus(jobId) as any + if (res.status === 'completed') { + return { + isError: false, + data: { + ...res, + total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), + }, + } + } + if (res.status === 'failed' || !res.status) { + return { + isError: true, + errorMessage: res.message, + data: { + data: [], + }, + } + } + // update the progress + setCrawlResult({ + ...res, + total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), + }) + onCheckedCrawlResultChange(res.data || []) // default select the crawl result + await sleep(2500) + return await waitForCrawlFinished(jobId) + } + catch (e: any) { + const errorBody = await e.json() + return { + isError: true, + errorMessage: errorBody.message, + data: { + data: [], + }, + } + } + }, [crawlOptions.limit, onCheckedCrawlResultChange]) + + const handleRun = useCallback(async (value: FormData) => { + const { url, ...crawlOptions } = value + onCrawlOptionsChange(crawlOptions) + setStep(Step.running) + try { + const startTime = Date.now() + const res = await createJinaReaderTask({ + url, + options: crawlOptions, + }) as any + + if (res.data) { + const data = { + current: 1, + total: 1, + data: [{ + title: res.data.title, + markdown: res.data.content, + description: res.data.description, + source_url: res.data.url, + }], + time_consuming: (Date.now() - startTime) / 1000, + } + setCrawlResult(data) + onCheckedCrawlResultChange(data.data || []) + setCrawlErrorMessage('') + } + else if (res.job_id) { + const jobId = res.job_id + onJobIdChange(jobId) + const { isError, data, errorMessage } = await waitForCrawlFinished(jobId) + if (isError) { + setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`)) + } + else { + setCrawlResult(data) + onCheckedCrawlResultChange(data.data || []) // default select the crawl result + setCrawlErrorMessage('') + } + } + } + catch (e) { + setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!) + console.log(e) + } + finally { + setStep(Step.finished) + } + }, [onCrawlOptionsChange, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished]) + + return ( +
+
+
+ { + handleRun(value) + console.log('submit') + }} + /> +
+ {!isInit && ( +
+ {isRunning && ( + + )} + {showError && ( + + )} + {isCrawlFinished && !showError && ( + + )} +
+ )} +
+ ) +} +export default React.memo(JinaReader) diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/hooks.ts b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/hooks.ts new file mode 100644 index 0000000000..777c568356 --- /dev/null +++ b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/hooks.ts @@ -0,0 +1,90 @@ +import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types' +import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types' +import { useTranslation } from 'react-i18next' +import type { FormData } from '../base/options' +import { z } from 'zod' + +const ERROR_I18N_PREFIX = 'common.errorMsg' +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +export const useConfigurations = () => { + const { t } = useTranslation() + const configurations: BaseConfiguration[] = [ + { + type: BaseFieldType.textInput, + variable: 'url', + label: 'URL', + required: true, + showConditions: [], + placeholder: 'https://docs.dify.ai', + }, + { + type: BaseFieldType.numberInput, + variable: 'limit', + label: t(`${I18N_PREFIX}.limit`), + required: true, + showConditions: [], + }, + { + type: BaseFieldType.numberInput, + variable: 'max_depth', + label: t(`${I18N_PREFIX}.maxDepth`), + required: false, + showConditions: [], + tooltip: t(`${I18N_PREFIX}.maxDepthTooltip`), + }, + { + type: BaseFieldType.textInput, + variable: 'excludes', + label: t(`${I18N_PREFIX}.excludePaths`), + required: false, + showConditions: [], + placeholder: 'blog/*, /about/*', + }, + { + type: BaseFieldType.textInput, + variable: 'includes', + label: t(`${I18N_PREFIX}.includeOnlyPaths`), + required: false, + showConditions: [], + placeholder: 'articles/*', + }, + { + type: BaseFieldType.checkbox, + variable: 'crawl_sub_pages', + label: t(`${I18N_PREFIX}.crawlSubPage`), + required: false, + showConditions: [], + }, + { + type: BaseFieldType.checkbox, + variable: 'only_main_content', + label: t(`${I18N_PREFIX}.extractOnlyMainContent`), + required: false, + showConditions: [], + }, + ] + + return configurations +} + +export const useSchema = () => { + const { t } = useTranslation() + + const Schema = z.object({ + url: z.string().nonempty({ + message: t(`${ERROR_I18N_PREFIX}.fieldRequired`, { + field: 'url', + }), + }).regex(/^https?:\/\//, { + message: t(`${ERROR_I18N_PREFIX}.urlError`), + }), + limit: z.number().positive({ + message: t(`${ERROR_I18N_PREFIX}.fieldRequired`, { + field: t(`${I18N_PREFIX}.limit`), + }), + }).int(), + }).passthrough() + + return Schema +} diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/index.tsx b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/index.tsx new file mode 100644 index 0000000000..27708d9f8d --- /dev/null +++ b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/index.tsx @@ -0,0 +1,203 @@ +'use client' +import React, { useCallback, useEffect, useState } from 'react' +import { useTranslation } from 'react-i18next' +import { useModalContextSelector } from '@/context/modal-context' +import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' +import { checkWatercrawlTaskStatus, createWatercrawlTask } from '@/service/datasets' +import { sleep } from '@/utils' +import Header from '@/app/components/datasets/create/website/base/header' +import type { FormData } from '../base/options' +import Options from '../base/options' +import { useConfigurations, useSchema } from './hooks' +import Crawling from '../base/crawling' +import ErrorMessage from '../base/error-message' +import CrawledResult from '../base/crawled-result' + +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type WaterCrawlProps = { + checkedCrawlResult: CrawlResultItem[] + onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void + onJobIdChange: (jobId: string) => void + crawlOptions: CrawlOptions + onCrawlOptionsChange: (payload: CrawlOptions) => void +} + +enum Step { + init = 'init', + running = 'running', + finished = 'finished', +} + +const WaterCrawl = ({ + checkedCrawlResult, + onCheckedCrawlResultChange, + onJobIdChange, + crawlOptions, + onCrawlOptionsChange, +}: WaterCrawlProps) => { + const { t } = useTranslation() + const [step, setStep] = useState(Step.init) + const [controlFoldOptions, setControlFoldOptions] = useState(0) + const configurations = useConfigurations() + const schema = useSchema() + + useEffect(() => { + if (step !== Step.init) + setControlFoldOptions(Date.now()) + }, [step]) + + const setShowAccountSettingModal = useModalContextSelector(state => state.setShowAccountSettingModal) + const handleSetting = useCallback(() => { + setShowAccountSettingModal({ + payload: 'data-source', + }) + }, [setShowAccountSettingModal]) + + const isInit = step === Step.init + const isCrawlFinished = step === Step.finished + const isRunning = step === Step.running + const [crawlResult, setCrawlResult] = useState<{ + current: number + total: number + data: CrawlResultItem[] + time_consuming: number | string + } | undefined>(undefined) + const [crawlErrorMessage, setCrawlErrorMessage] = useState('') + const showError = isCrawlFinished && crawlErrorMessage + + const waitForCrawlFinished = useCallback(async (jobId: string): Promise => { + try { + const res = await checkWatercrawlTaskStatus(jobId) as any + if (res.status === 'completed') { + return { + isError: false, + data: { + ...res, + total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), + }, + } + } + if (res.status === 'error' || !res.status) { + // can't get the error message from the watercrawl api + return { + isError: true, + errorMessage: res.message, + data: { + data: [], + }, + } + } + // update the progress + setCrawlResult({ + ...res, + total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), + }) + onCheckedCrawlResultChange(res.data || []) // default select the crawl result + await sleep(2500) + return await waitForCrawlFinished(jobId) + } + catch (e: any) { + const errorBody = await e.json() + return { + isError: true, + errorMessage: errorBody.message, + data: { + data: [], + }, + } + } + }, [crawlOptions.limit, onCheckedCrawlResultChange]) + + const handleRun = useCallback(async (value: FormData) => { + const { url, ...crawlOptions } = value + onCrawlOptionsChange(crawlOptions) + setStep(Step.running) + try { + const passToServerCrawlOptions: any = { + ...crawlOptions, + } + if (crawlOptions.max_depth === '') + delete passToServerCrawlOptions.max_depth + + const res = await createWatercrawlTask({ + url, + options: passToServerCrawlOptions, + }) as any + const jobId = res.job_id + onJobIdChange(jobId) + const { isError, data, errorMessage } = await waitForCrawlFinished(jobId) + if (isError) { + setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`)) + } + else { + setCrawlResult(data) + onCheckedCrawlResultChange(data.data || []) // default select the crawl result + setCrawlErrorMessage('') + } + } + catch (e) { + setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!) + console.log(e) + } + finally { + setStep(Step.finished) + } + }, [onCrawlOptionsChange, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished]) + + return ( +
+
+
+ { + handleRun(value) + console.log('submit') + }} + /> +
+ {!isInit && ( +
+ {isRunning && ( + + )} + {showError && ( + + )} + {isCrawlFinished && !showError && ( + + )} +
+ )} +
+ ) +} +export default React.memo(WaterCrawl) diff --git a/web/app/components/rag-pipeline/components/panel/test-run/index.tsx b/web/app/components/rag-pipeline/components/panel/test-run/index.tsx index 456fe3d00d..1688fac522 100644 --- a/web/app/components/rag-pipeline/components/panel/test-run/index.tsx +++ b/web/app/components/rag-pipeline/components/panel/test-run/index.tsx @@ -16,11 +16,13 @@ import Notion from './data-source/notion' import VectorSpaceFull from '@/app/components/billing/vector-space-full' import { DEFAULT_CRAWL_OPTIONS } from './consts' import Firecrawl from './data-source/website/firecrawl' +import JinaReader from './data-source/website/jina-reader' +import WaterCrawl from './data-source/website/water-crawl' const TestRunPanel = () => { const { t } = useTranslation() const [currentStep, setCurrentStep] = useState(1) - const [dataSource, setDataSource] = useState(DataSourceProvider.fireCrawl) + const [dataSource, setDataSource] = useState(DataSourceProvider.waterCrawl) const [fileList, setFiles] = useState([]) const [notionPages, setNotionPages] = useState([]) const [websitePages, setWebsitePages] = useState([]) @@ -51,8 +53,12 @@ const TestRunPanel = () => { return nextDisabled if (dataSource === DataSourceType.NOTION) return isShowVectorSpaceFull || !notionPages.length + if (dataSource === DataSourceProvider.fireCrawl + || dataSource === DataSourceProvider.jinaReader + || dataSource === DataSourceProvider.waterCrawl) + return isShowVectorSpaceFull || !websitePages.length return false - }, [dataSource, nextDisabled, isShowVectorSpaceFull, notionPages.length]) + }, [dataSource, nextDisabled, isShowVectorSpaceFull, notionPages.length, websitePages.length]) const handleClose = () => { setShowTestRunPanel?.(false) @@ -135,6 +141,24 @@ const TestRunPanel = () => { onCrawlOptionsChange={setCrawlOptions} /> )} + {dataSource === DataSourceProvider.jinaReader && ( + + )} + {dataSource === DataSourceProvider.waterCrawl && ( + + )} {isShowVectorSpaceFull && ( )}