refactor: website data source components and hooks

This commit is contained in:
twwu 2025-05-21 10:53:18 +08:00
parent cf73faf174
commit 20343facad
19 changed files with 322 additions and 886 deletions

View File

@ -23,16 +23,14 @@ const Actions = ({
return CustomActions(form)
return (
<div className='flex items-center justify-end p-4 pt-2'>
<Button
variant='primary'
disabled = {isSubmitting || !canSubmit}
loading={isSubmitting}
onClick={() => form.handleSubmit()}
>
{t('common.operation.submit')}
</Button>
</div>
<Button
variant='primary'
disabled={isSubmitting || !canSubmit}
loading={isSubmitting}
onClick={() => form.handleSubmit()}
>
{t('common.operation.submit')}
</Button>
)
}

View File

@ -156,6 +156,7 @@ const BaseField = ({
allowed_file_extensions: allowedFileExtensions,
allowed_file_types: allowedFileTypes,
allowed_file_upload_methods: allowedFileUploadMethods,
number_limits: 1,
}}
/>
)}

View File

@ -45,7 +45,7 @@ export default function WorkspaceSelector({
<MenuItems
className='absolute left-0 top-8 z-10 w-80
origin-top-right rounded-lg border-[0.5px]
border-components-panel-border bg-components-panel-bg-blur shadow-lg shadow-shadow-shadow-5 backdrop-blur-[5px]'
border-components-panel-border bg-components-panel-bg-blur shadow-lg shadow-shadow-shadow-5'
>
<div className="max-h-50 overflow-auto p-1">
{

View File

@ -6,9 +6,9 @@ import { RiBookOpenLine, RiEqualizer2Line } from '@remixicon/react'
type HeaderProps = {
isInPipeline?: boolean
onClickConfiguration: () => void
onClickConfiguration?: () => void
title: string
buttonText: string
buttonText?: string
docTitle: string
docLink: string
}
@ -31,21 +31,21 @@ const Header = ({
{title}
</div>
<Divider type='vertical' className='mx-1 h-3.5' />
<Button
variant='secondary'
size='small'
className={cn(isInPipeline ? 'px-1' : 'px-1.5')}
>
<RiEqualizer2Line
className='h-4 w-4'
onClick={onClickConfiguration}
/>
{!isInPipeline && (
{!isInPipeline && (
<Button
variant='secondary'
size='small'
className='px-1.5'
>
<RiEqualizer2Line
className='h-4 w-4'
onClick={onClickConfiguration}
/>
<span className='system-xs-medium'>
{buttonText}
</span>
)}
</Button>
</Button>
)}
</div>
<a
className='system-xs-medium flex items-center gap-x-1 overflow-hidden text-text-accent'

View File

@ -0,0 +1,133 @@
'use client'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import type { CrawlResultItem } from '@/models/datasets'
import Header from '@/app/components/datasets/create/website/base/header'
import Options from '../base/options'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import CrawledResult from '../base/crawled-result'
import type { RAGPipelineVariables } from '@/models/pipeline'
import { useDatasourceNodeRun } from '@/service/use-pipeline'
import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail'
import { useWebCrawlerHeaderInfo } from '../../../hooks'
import type { DataSourceProvider } from '@/models/common'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
// Props accepted by the shared Crawler component.
type CrawlerProps = {
// Sent as `node_id` in the datasource node run request.
nodeId: string
// Pipeline variables forwarded to the Options form.
variables: RAGPipelineVariables
// Currently checked crawl results; passed to CrawledResult as its checked list.
checkedCrawlResult: CrawlResultItem[]
// Key used to look up the header info (title, doc title, doc link) to render.
datasourceProvider: DataSourceProvider
// Called with the crawl results after a successful run (all selected by default)
// and passed to CrawledResult as its selection-change handler.
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
// Called with the `job_id` taken from the run response.
onJobIdChange: (jobId: string) => void
}
// Lifecycle of a crawl run as tracked by the Crawler component.
enum Step {
// Initial state: no run has been started, so no progress/result area is rendered.
init = 'init',
// Set when a run is submitted and stays until the request settles.
running = 'running',
// Set once the request has settled, whether it succeeded or failed.
finished = 'finished',
}
/**
 * Shared website-crawler panel: renders the provider header, the options form,
 * and, once a run has been started, the progress, error or result view.
 *
 * @param nodeId - Sent as `node_id` when running the datasource node.
 * @param variables - Pipeline variables forwarded to the options form.
 * @param checkedCrawlResult - Currently checked crawl results.
 * @param datasourceProvider - Selects which header info to display.
 * @param onCheckedCrawlResultChange - Receives the results of a successful run
 *   (all selected by default) and later selection changes.
 * @param onJobIdChange - Receives the `job_id` from the run response.
 */
const Crawler = ({
  nodeId,
  variables,
  checkedCrawlResult,
  datasourceProvider,
  onCheckedCrawlResultChange,
  onJobIdChange,
}: CrawlerProps) => {
  const { t } = useTranslation()
  const [step, setStep] = useState<Step>(Step.init)
  const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
  const pipelineId = useDatasetDetailContextWithSelector(s => s.dataset?.pipeline_id)
  const headerInfoMap = useWebCrawlerHeaderInfo()

  // Emit a fresh timestamp every time the step changes after the initial state.
  // The value is handed to Options — presumably as a signal to fold the form; confirm in Options.
  useEffect(() => {
    if (step !== Step.init)
      setControlFoldOptions(Date.now())
  }, [step])

  const isInit = step === Step.init
  const isCrawlFinished = step === Step.finished
  const isRunning = step === Step.running

  const [crawlResult, setCrawlResult] = useState<{
    result: CrawlResultItem[]
    time_consuming: number | string
  } | undefined>(undefined)
  const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
  const showError = isCrawlFinished && crawlErrorMessage

  const { mutateAsync: runDatasourceNode } = useDatasourceNodeRun()

  /**
   * Runs the datasource node with the submitted form values.
   *
   * `mutateAsync` rejects when the request fails, and the caller does not catch
   * the returned promise. The failure is therefore handled here with
   * try/catch/finally so that it never surfaces as an unhandled rejection.
   */
  const handleRun = useCallback(async (value: Record<string, any>) => {
    setStep(Step.running)
    try {
      const res: any = await runDatasourceNode({
        node_id: nodeId,
        // NOTE(review): asserted non-null — confirm the dataset detail is always loaded before submit.
        pipeline_id: pipelineId!,
        inputs: value,
      })
      onJobIdChange(res.job_id)
      setCrawlResult(res)
      onCheckedCrawlResultChange(res.result || []) // default select the crawl result
      setCrawlErrorMessage('')
    }
    catch (error) {
      // The rejection value is not guaranteed to be an Error instance; read `message` defensively.
      const message = (error as { message?: string } | null | undefined)?.message
      setCrawlErrorMessage(message || t(`${I18N_PREFIX}.unknownError`))
    }
    finally {
      setStep(Step.finished)
    }
  }, [runDatasourceNode, nodeId, pipelineId, onJobIdChange, onCheckedCrawlResultChange, t])

  return (
    <div>
      <Header
        isInPipeline
        {...headerInfoMap[datasourceProvider]}
      />
      <div className='mt-2 rounded-xl border border-components-panel-border bg-background-default-subtle'>
        <Options
          variables={variables}
          isRunning={isRunning}
          controlFoldOptions={controlFoldOptions}
          onSubmit={(value) => {
            handleRun(value)
          }}
        />
      </div>
      {!isInit && (
        <div className='relative'>
          {/* The run request reports no incremental progress, so both counters stay at 0. */}
          {isRunning && (
            <Crawling
              crawledNum={0}
              totalNum={0}
            />
          )}
          {showError && (
            <ErrorMessage
              className='mt-2'
              title={t(`${I18N_PREFIX}.exceptionErrorTitle`)}
              errorMsg={crawlErrorMessage}
            />
          )}
          {isCrawlFinished && !showError && (
            <CrawledResult
              className='mt-2'
              list={crawlResult?.result || []}
              checkedList={checkedCrawlResult}
              onSelectedChange={onCheckedCrawlResultChange}
              usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
            />
          )}
        </div>
      )}
    </div>
  )
}
export default React.memo(Crawler)

View File

@ -0,0 +1,50 @@
import type { BaseConfiguration, BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types'
import { PipelineInputVarType, type RAGPipelineVariables } from '@/models/pipeline'
import { useMemo } from 'react'
/**
 * Builds the default form values for a set of pipeline variables.
 *
 * Each variable contributes one entry keyed by `variable`. Its value is the
 * variable's `default_value` when one is provided, otherwise a fallback chosen
 * by field type: `''` for text/paragraph/select, `0` for number, `[]` for file
 * fields and `true` for checkboxes.
 *
 * @param variables - Pipeline variables describing the form fields.
 * @returns A memoized record of initial values, recomputed when `variables` changes.
 */
export const useInitialData = (variables: RAGPipelineVariables) => {
  const initialData = useMemo(() => {
    const initialData: Record<string, any> = {}
    variables.forEach((item) => {
      if ([PipelineInputVarType.textInput, PipelineInputVarType.paragraph, PipelineInputVarType.select].includes(item.type))
        initialData[item.variable] = item.default_value || ''
      if (item.type === PipelineInputVarType.number)
        initialData[item.variable] = item.default_value || 0
      if ([PipelineInputVarType.singleFile, PipelineInputVarType.multiFiles].includes(item.type))
        initialData[item.variable] = item.default_value || []
      if (item.type === PipelineInputVarType.checkbox) {
        // `default_value || true` would turn an explicit `false` default into `true`.
        // Fall back to `true` only when no default was provided (undefined, null or '').
        const defaultValue: unknown = item.default_value
        const isUnset = defaultValue == null || defaultValue === ''
        initialData[item.variable] = isUnset ? true : defaultValue
      }
    })
    return initialData
  }, [variables])
  return initialData
}
/**
 * Converts pipeline variables into the field configurations consumed by the base form.
 *
 * @param variables - Pipeline variables describing the form fields.
 * @returns A memoized list with one configuration per variable, in the same order.
 */
export const useConfigurations = (variables: RAGPipelineVariables) => {
  return useMemo(() => {
    return variables.map((variable): BaseConfiguration => {
      // Each option string is used as both the label and the value of a select entry.
      const selectOptions = variable.options?.map(option => ({
        label: option,
        value: option,
      }))
      return {
        type: variable.type as unknown as BaseFieldType,
        variable: variable.variable,
        label: variable.label,
        required: variable.required,
        placeholder: variable.placeholder,
        tooltip: variable.tooltips,
        options: selectOptions,
        maxLength: variable.max_length,
        showConditions: [],
        allowedFileUploadMethods: variable.allowed_file_upload_methods,
        allowedFileTypes: variable.allowed_file_types,
        allowedFileExtensions: variable.allowed_file_extensions,
      }
    })
  }, [variables])
}

View File

@ -1,35 +1,39 @@
import Button from '@/app/components/base/button'
import { useAppForm } from '@/app/components/base/form'
import BaseField from '@/app/components/base/form/form-scenarios/base/field'
import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types'
import { ArrowDownRoundFill } from '@/app/components/base/icons/src/vender/solid/general'
import cn from '@/utils/classnames'
import { RiPlayLargeLine } from '@remixicon/react'
import { useBoolean } from 'ahooks'
import { useEffect } from 'react'
import { useEffect, useMemo } from 'react'
import { useTranslation } from 'react-i18next'
import Toast from '@/app/components/base/toast'
import type { ZodSchema } from 'zod'
import type { RAGPipelineVariables } from '@/models/pipeline'
import { useConfigurations, useInitialData } from './hooks'
import { generateZodSchema } from '@/app/components/base/form/form-scenarios/base/utils'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
type OptionsProps = {
initialData: Record<string, any>
configurations: BaseConfiguration[]
variables: RAGPipelineVariables
isRunning: boolean
controlFoldOptions?: number
schema: ZodSchema
onSubmit: (data: Record<string, any>) => void
}
const Options = ({
initialData,
configurations,
variables,
isRunning,
controlFoldOptions,
schema,
onSubmit,
}: OptionsProps) => {
const { t } = useTranslation()
const initialData = useInitialData(variables)
const configurations = useConfigurations(variables)
const schema = useMemo(() => {
return generateZodSchema(configurations)
}, [configurations])
const form = useAppForm({
defaultValues: initialData,
validators: {
@ -53,8 +57,6 @@ const Options = ({
},
})
const { t } = useTranslation()
const [fold, {
toggle: foldToggle,
setTrue: foldHide,

View File

@ -1,89 +0,0 @@
import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types'
import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types'
import { useTranslation } from 'react-i18next'
import { z } from 'zod'
const ERROR_I18N_PREFIX = 'common.errorMsg'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
/**
 * Returns the static field list of the crawl options form: URL, page limit,
 * max depth, exclude/include paths and the two crawl toggles.
 * Labels and tooltips are translated with the current i18n instance.
 */
export const useConfigurations = () => {
  const { t } = useTranslation()

  const urlField: BaseConfiguration = {
    type: BaseFieldType.textInput,
    variable: 'url',
    label: 'URL',
    required: true,
    showConditions: [],
    placeholder: 'https://docs.dify.ai',
  }
  const limitField: BaseConfiguration = {
    type: BaseFieldType.numberInput,
    variable: 'limit',
    label: t(`${I18N_PREFIX}.limit`),
    required: true,
    showConditions: [],
  }
  const maxDepthField: BaseConfiguration = {
    type: BaseFieldType.numberInput,
    variable: 'max_depth',
    label: t(`${I18N_PREFIX}.maxDepth`),
    required: false,
    showConditions: [],
    tooltip: t(`${I18N_PREFIX}.maxDepthTooltip`),
  }
  const excludesField: BaseConfiguration = {
    type: BaseFieldType.textInput,
    variable: 'excludes',
    label: t(`${I18N_PREFIX}.excludePaths`),
    required: false,
    showConditions: [],
    placeholder: 'blog/*, /about/*',
  }
  const includesField: BaseConfiguration = {
    type: BaseFieldType.textInput,
    variable: 'includes',
    label: t(`${I18N_PREFIX}.includeOnlyPaths`),
    required: false,
    showConditions: [],
    placeholder: 'articles/*',
  }
  const crawlSubPagesField: BaseConfiguration = {
    type: BaseFieldType.checkbox,
    variable: 'crawl_sub_pages',
    label: t(`${I18N_PREFIX}.crawlSubPage`),
    required: false,
    showConditions: [],
  }
  const onlyMainContentField: BaseConfiguration = {
    type: BaseFieldType.checkbox,
    variable: 'only_main_content',
    label: t(`${I18N_PREFIX}.extractOnlyMainContent`),
    required: false,
    showConditions: [],
  }

  // Array order is the order in which the fields are listed to the form.
  const fields: BaseConfiguration[] = [
    urlField,
    limitField,
    maxDepthField,
    excludesField,
    includesField,
    crawlSubPagesField,
    onlyMainContentField,
  ]
  return fields
}
/**
 * Returns the zod schema used to validate the crawl options form.
 * `url` must be a non-empty string starting with `http://` or `https://`;
 * `limit` must be a positive integer. Any other keys pass through unvalidated.
 */
export const useSchema = () => {
  const { t } = useTranslation()

  const urlRequiredMessage = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
    field: 'url',
  })
  const urlFormatMessage = t(`${ERROR_I18N_PREFIX}.urlError`)
  const limitMessage = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
    field: t(`${I18N_PREFIX}.limit`),
  })

  const urlSchema = z.string()
    .nonempty({ message: urlRequiredMessage })
    .regex(/^https?:\/\//, { message: urlFormatMessage })
  const limitSchema = z.number()
    .positive({ message: limitMessage })
    .int()

  return z.object({
    url: urlSchema,
    limit: limitSchema,
  }).passthrough()
}

View File

@ -1,202 +1,34 @@
'use client'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { useModalContextSelector } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import { checkFirecrawlTaskStatus, createFirecrawlTask } from '@/service/datasets'
import { sleep } from '@/utils'
import Header from '@/app/components/datasets/create/website/base/header'
import Options from '../base/options'
import { useConfigurations, useSchema } from './hooks'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import CrawledResult from '../base/crawled-result'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
import React from 'react'
import type { CrawlResultItem } from '@/models/datasets'
import type { RAGPipelineVariables } from '@/models/pipeline'
import Crawler from '../base/crawler'
import { DataSourceProvider } from '@/models/common'
type FireCrawlProps = {
nodeId: string
variables: RAGPipelineVariables
checkedCrawlResult: CrawlResultItem[]
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
onJobIdChange: (jobId: string) => void
crawlOptions: CrawlOptions
onCrawlOptionsChange: (payload: CrawlOptions) => void
}
enum Step {
init = 'init',
running = 'running',
finished = 'finished',
}
const FireCrawl = ({
nodeId,
variables,
checkedCrawlResult,
onCheckedCrawlResultChange,
onJobIdChange,
crawlOptions,
onCrawlOptionsChange,
}: FireCrawlProps) => {
const { t } = useTranslation()
const [step, setStep] = useState<Step>(Step.init)
const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
const configurations = useConfigurations()
const schema = useSchema()
useEffect(() => {
if (step !== Step.init)
setControlFoldOptions(Date.now())
}, [step])
const setShowAccountSettingModal = useModalContextSelector(s => s.setShowAccountSettingModal)
const handleSetting = useCallback(() => {
setShowAccountSettingModal({
payload: 'data-source',
})
}, [setShowAccountSettingModal])
const isInit = step === Step.init
const isCrawlFinished = step === Step.finished
const isRunning = step === Step.running
const [crawlResult, setCrawlResult] = useState<{
current: number
total: number
data: CrawlResultItem[]
time_consuming: number | string
} | undefined>(undefined)
const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
const showError = isCrawlFinished && crawlErrorMessage
const waitForCrawlFinished = useCallback(async (jobId: string) => {
try {
const res = await checkFirecrawlTaskStatus(jobId) as any
if (res.status === 'completed') {
return {
isError: false,
data: {
...res,
total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
},
}
}
if (res.status === 'error' || !res.status) {
// can't get the error message from the firecrawl api
return {
isError: true,
errorMessage: res.message,
data: {
data: [],
},
}
}
// update the progress
setCrawlResult({
...res,
total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
})
onCheckedCrawlResultChange(res.data || []) // default select the crawl result
await sleep(2500)
return await waitForCrawlFinished(jobId)
}
catch (e: any) {
const errorBody = await e.json()
return {
isError: true,
errorMessage: errorBody.message,
data: {
data: [],
},
}
}
}, [crawlOptions.limit, onCheckedCrawlResultChange])
const handleRun = useCallback(async (value: Record<string, any>) => {
const { url, ...crawlOptions } = value
onCrawlOptionsChange(crawlOptions as CrawlOptions)
setStep(Step.running)
try {
const passToServerCrawlOptions: any = {
...crawlOptions,
}
if (crawlOptions.max_depth === '')
delete passToServerCrawlOptions.max_depth
const res = await createFirecrawlTask({
url,
options: passToServerCrawlOptions,
}) as any
const jobId = res.job_id
onJobIdChange(jobId)
const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
if (isError) {
setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
}
else {
setCrawlResult(data)
onCheckedCrawlResultChange(data.data || []) // default select the crawl result
setCrawlErrorMessage('')
}
}
catch (e) {
setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
console.log(e)
}
finally {
setStep(Step.finished)
}
}, [onCrawlOptionsChange, onJobIdChange, t, waitForCrawlFinished, onCheckedCrawlResultChange])
return (
<div>
<Header
isInPipeline
onClickConfiguration={handleSetting}
title={t(`${I18N_PREFIX}.firecrawlTitle`)}
buttonText={t(`${I18N_PREFIX}.configureFirecrawl`)}
docTitle={t(`${I18N_PREFIX}.firecrawlDoc`)}
docLink={'https://docs.firecrawl.dev/introduction'}
/>
<div className='mt-2 rounded-xl border border-components-panel-border bg-background-default-subtle'>
<Options
initialData={{
...crawlOptions,
url: '',
}}
configurations={configurations}
isRunning={isRunning}
controlFoldOptions={controlFoldOptions}
schema={schema}
onSubmit={(value) => {
handleRun(value)
console.log('submit')
}}
/>
</div>
{!isInit && (
<div className='relative'>
{isRunning && (
<Crawling
crawledNum={crawlResult?.current || 0}
totalNum={crawlResult?.total || Number.parseFloat(crawlOptions.limit as string) || 0}
/>
)}
{showError && (
<ErrorMessage
className='mt-2'
title={t(`${I18N_PREFIX}.exceptionErrorTitle`)}
errorMsg={crawlErrorMessage}
/>
)}
{isCrawlFinished && !showError && (
<CrawledResult
className='mt-2'
list={crawlResult?.data || []}
checkedList={checkedCrawlResult}
onSelectedChange={onCheckedCrawlResultChange}
usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
/>
)}
</div>
)}
</div>
<Crawler
nodeId={nodeId}
variables={variables}
checkedCrawlResult={checkedCrawlResult}
datasourceProvider={DataSourceProvider.fireCrawl}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
/>
)
}
export default React.memo(FireCrawl)
export default FireCrawl

View File

@ -1,66 +0,0 @@
import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types'
import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types'
import { useTranslation } from 'react-i18next'
import { z } from 'zod'
const ERROR_I18N_PREFIX = 'common.errorMsg'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
/**
 * Returns the static field list of the crawl options form: URL, page limit,
 * the crawl-sub-pages toggle and the use-sitemap toggle.
 * Labels and tooltips are translated with the current i18n instance.
 */
export const useConfigurations = () => {
  const { t } = useTranslation()

  const urlField: BaseConfiguration = {
    type: BaseFieldType.textInput,
    variable: 'url',
    label: 'URL',
    required: true,
    showConditions: [],
    placeholder: 'https://docs.dify.ai',
  }
  const limitField: BaseConfiguration = {
    type: BaseFieldType.numberInput,
    variable: 'limit',
    label: t(`${I18N_PREFIX}.limit`),
    required: true,
    showConditions: [],
  }
  const crawlSubPagesField: BaseConfiguration = {
    type: BaseFieldType.checkbox,
    variable: 'crawl_sub_pages',
    label: t(`${I18N_PREFIX}.crawlSubPage`),
    required: false,
    showConditions: [],
  }
  const useSitemapField: BaseConfiguration = {
    type: BaseFieldType.checkbox,
    variable: 'use_sitemap',
    label: t(`${I18N_PREFIX}.useSitemap`),
    tooltip: t(`${I18N_PREFIX}.useSitemapTooltip`),
    required: false,
    showConditions: [],
  }

  // Array order is the order in which the fields are listed to the form.
  const fields: BaseConfiguration[] = [
    urlField,
    limitField,
    crawlSubPagesField,
    useSitemapField,
  ]
  return fields
}
/**
 * Returns the zod schema used to validate the crawl options form.
 * `url` must be a non-empty string starting with `http://` or `https://`;
 * `limit` must be a positive integer. Any other keys pass through unvalidated.
 */
export const useSchema = () => {
  const { t } = useTranslation()

  // Resolve every validation message up front, then assemble the schema from them.
  const requiredUrlText = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
    field: 'url',
  })
  const invalidUrlText = t(`${ERROR_I18N_PREFIX}.urlError`)
  const requiredLimitText = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
    field: t(`${I18N_PREFIX}.limit`),
  })

  const url = z.string()
    .nonempty({ message: requiredUrlText })
    .regex(/^https?:\/\//, { message: invalidUrlText })
  const limit = z.number()
    .positive({ message: requiredLimitText })
    .int()

  return z.object({ url, limit }).passthrough()
}

View File

@ -1,215 +1,34 @@
'use client'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import CrawledResult from '../base/crawled-result'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import { useModalContextSelector } from '@/context/modal-context'
import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets'
import { sleep } from '@/utils'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import Header from '@/app/components/datasets/create/website/base/header'
import Options from '../base/options'
import { useConfigurations, useSchema } from './hooks'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
import React from 'react'
import type { CrawlResultItem } from '@/models/datasets'
import type { RAGPipelineVariables } from '@/models/pipeline'
import Crawler from '../base/crawler'
import { DataSourceProvider } from '@/models/common'
type JinaReaderProps = {
nodeId: string
variables: RAGPipelineVariables
checkedCrawlResult: CrawlResultItem[]
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
onJobIdChange: (jobId: string) => void
crawlOptions: CrawlOptions
onCrawlOptionsChange: (payload: CrawlOptions) => void
}
enum Step {
init = 'init',
running = 'running',
finished = 'finished',
}
const JinaReader = ({
nodeId,
variables,
checkedCrawlResult,
onCheckedCrawlResultChange,
onJobIdChange,
crawlOptions,
onCrawlOptionsChange,
}: JinaReaderProps) => {
const { t } = useTranslation()
const [step, setStep] = useState<Step>(Step.init)
const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
const configurations = useConfigurations()
const schema = useSchema()
useEffect(() => {
if (step !== Step.init)
setControlFoldOptions(Date.now())
}, [step])
const setShowAccountSettingModal = useModalContextSelector(state => state.setShowAccountSettingModal)
const handleSetting = useCallback(() => {
setShowAccountSettingModal({
payload: 'data-source',
})
}, [setShowAccountSettingModal])
const isInit = step === Step.init
const isCrawlFinished = step === Step.finished
const isRunning = step === Step.running
const [crawlResult, setCrawlResult] = useState<{
current: number
total: number
data: CrawlResultItem[]
time_consuming: number | string
} | undefined>(undefined)
const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
const showError = isCrawlFinished && crawlErrorMessage
const waitForCrawlFinished = useCallback(async (jobId: string) => {
try {
const res = await checkJinaReaderTaskStatus(jobId) as any
if (res.status === 'completed') {
return {
isError: false,
data: {
...res,
total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
},
}
}
if (res.status === 'failed' || !res.status) {
return {
isError: true,
errorMessage: res.message,
data: {
data: [],
},
}
}
// update the progress
setCrawlResult({
...res,
total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
})
onCheckedCrawlResultChange(res.data || []) // default select the crawl result
await sleep(2500)
return await waitForCrawlFinished(jobId)
}
catch (e: any) {
const errorBody = await e.json()
return {
isError: true,
errorMessage: errorBody.message,
data: {
data: [],
},
}
}
}, [crawlOptions.limit, onCheckedCrawlResultChange])
const handleRun = useCallback(async (value: Record<string, any>) => {
const { url, ...crawlOptions } = value
onCrawlOptionsChange(crawlOptions as CrawlOptions)
setStep(Step.running)
try {
const startTime = Date.now()
const res = await createJinaReaderTask({
url,
options: crawlOptions,
}) as any
if (res.data) {
const data = {
current: 1,
total: 1,
data: [{
title: res.data.title,
markdown: res.data.content,
description: res.data.description,
source_url: res.data.url,
}],
time_consuming: (Date.now() - startTime) / 1000,
}
setCrawlResult(data)
onCheckedCrawlResultChange(data.data || [])
setCrawlErrorMessage('')
}
else if (res.job_id) {
const jobId = res.job_id
onJobIdChange(jobId)
const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
if (isError) {
setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
}
else {
setCrawlResult(data)
onCheckedCrawlResultChange(data.data || []) // default select the crawl result
setCrawlErrorMessage('')
}
}
}
catch (e) {
setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
console.log(e)
}
finally {
setStep(Step.finished)
}
}, [onCrawlOptionsChange, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished])
return (
<div>
<Header
isInPipeline
onClickConfiguration={handleSetting}
title={t(`${I18N_PREFIX}.jinaReaderTitle`)}
buttonText={t(`${I18N_PREFIX}.configureJinaReader`)}
docTitle={t(`${I18N_PREFIX}.jinaReaderDoc`)}
docLink={'https://jina.ai/reader'}
/>
<div className='mt-2 rounded-xl border border-components-panel-border bg-background-default-subtle'>
<Options
initialData={{
...crawlOptions,
url: '',
}}
configurations={configurations}
isRunning={isRunning}
controlFoldOptions={controlFoldOptions}
schema={schema}
onSubmit={(value) => {
handleRun(value)
console.log('submit')
}}
/>
</div>
{!isInit && (
<div className='relative'>
{isRunning && (
<Crawling
crawledNum={crawlResult?.current || 0}
totalNum={crawlResult?.total || Number.parseFloat(crawlOptions.limit as string) || 0}
/>
)}
{showError && (
<ErrorMessage
className='mt-2'
title={t(`${I18N_PREFIX}.exceptionErrorTitle`)}
errorMsg={crawlErrorMessage}
/>
)}
{isCrawlFinished && !showError && (
<CrawledResult
className='mt-2'
list={crawlResult?.data || []}
checkedList={checkedCrawlResult}
onSelectedChange={onCheckedCrawlResultChange}
usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
/>
)}
</div>
)}
</div>
<Crawler
nodeId={nodeId}
variables={variables}
checkedCrawlResult={checkedCrawlResult}
datasourceProvider={DataSourceProvider.jinaReader}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
/>
)
}
export default React.memo(JinaReader)

View File

@ -1,89 +0,0 @@
import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types'
import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types'
import { useTranslation } from 'react-i18next'
import { z } from 'zod'
const ERROR_I18N_PREFIX = 'common.errorMsg'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
/**
 * Returns the static field list of the crawl options form: URL, page limit,
 * max depth, exclude/include paths and the two crawl toggles.
 * Labels and tooltips are translated with the current i18n instance.
 */
export const useConfigurations = () => {
  const { t } = useTranslation()

  const url: BaseConfiguration = {
    type: BaseFieldType.textInput,
    variable: 'url',
    label: 'URL',
    required: true,
    showConditions: [],
    placeholder: 'https://docs.dify.ai',
  }
  const limit: BaseConfiguration = {
    type: BaseFieldType.numberInput,
    variable: 'limit',
    label: t(`${I18N_PREFIX}.limit`),
    required: true,
    showConditions: [],
  }
  const maxDepth: BaseConfiguration = {
    type: BaseFieldType.numberInput,
    variable: 'max_depth',
    label: t(`${I18N_PREFIX}.maxDepth`),
    required: false,
    showConditions: [],
    tooltip: t(`${I18N_PREFIX}.maxDepthTooltip`),
  }
  const excludes: BaseConfiguration = {
    type: BaseFieldType.textInput,
    variable: 'excludes',
    label: t(`${I18N_PREFIX}.excludePaths`),
    required: false,
    showConditions: [],
    placeholder: 'blog/*, /about/*',
  }
  const includes: BaseConfiguration = {
    type: BaseFieldType.textInput,
    variable: 'includes',
    label: t(`${I18N_PREFIX}.includeOnlyPaths`),
    required: false,
    showConditions: [],
    placeholder: 'articles/*',
  }
  const crawlSubPages: BaseConfiguration = {
    type: BaseFieldType.checkbox,
    variable: 'crawl_sub_pages',
    label: t(`${I18N_PREFIX}.crawlSubPage`),
    required: false,
    showConditions: [],
  }
  const onlyMainContent: BaseConfiguration = {
    type: BaseFieldType.checkbox,
    variable: 'only_main_content',
    label: t(`${I18N_PREFIX}.extractOnlyMainContent`),
    required: false,
    showConditions: [],
  }

  // Array order is the order in which the fields are listed to the form.
  const fieldList: BaseConfiguration[] = [
    url,
    limit,
    maxDepth,
    excludes,
    includes,
    crawlSubPages,
    onlyMainContent,
  ]
  return fieldList
}
/**
 * Returns the zod schema used to validate the crawl options form.
 * `url` must be a non-empty string starting with `http://` or `https://`;
 * `limit` must be a positive integer. Any other keys pass through unvalidated.
 */
export const useSchema = () => {
  const { t } = useTranslation()

  const messages = {
    urlRequired: t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
      field: 'url',
    }),
    urlInvalid: t(`${ERROR_I18N_PREFIX}.urlError`),
    limitRequired: t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
      field: t(`${I18N_PREFIX}.limit`),
    }),
  }

  const urlRule = z.string()
    .nonempty({ message: messages.urlRequired })
    .regex(/^https?:\/\//, { message: messages.urlInvalid })
  const limitRule = z.number()
    .positive({ message: messages.limitRequired })
    .int()

  return z.object({
    url: urlRule,
    limit: limitRule,
  }).passthrough()
}

View File

@ -1,202 +1,34 @@
'use client'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { useModalContextSelector } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import { checkWatercrawlTaskStatus, createWatercrawlTask } from '@/service/datasets'
import { sleep } from '@/utils'
import Header from '@/app/components/datasets/create/website/base/header'
import Options from '../base/options'
import { useConfigurations, useSchema } from './hooks'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import CrawledResult from '../base/crawled-result'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
import React from 'react'
import type { CrawlResultItem } from '@/models/datasets'
import type { RAGPipelineVariables } from '@/models/pipeline'
import Crawler from '../base/crawler'
import { DataSourceProvider } from '@/models/common'
type WaterCrawlProps = {
nodeId: string
variables: RAGPipelineVariables
checkedCrawlResult: CrawlResultItem[]
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
onJobIdChange: (jobId: string) => void
crawlOptions: CrawlOptions
onCrawlOptionsChange: (payload: CrawlOptions) => void
}
enum Step {
init = 'init',
running = 'running',
finished = 'finished',
}
const WaterCrawl = ({
nodeId,
variables,
checkedCrawlResult,
onCheckedCrawlResultChange,
onJobIdChange,
crawlOptions,
onCrawlOptionsChange,
}: WaterCrawlProps) => {
const { t } = useTranslation()
const [step, setStep] = useState<Step>(Step.init)
const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
const configurations = useConfigurations()
const schema = useSchema()
useEffect(() => {
if (step !== Step.init)
setControlFoldOptions(Date.now())
}, [step])
const setShowAccountSettingModal = useModalContextSelector(state => state.setShowAccountSettingModal)
const handleSetting = useCallback(() => {
setShowAccountSettingModal({
payload: 'data-source',
})
}, [setShowAccountSettingModal])
const isInit = step === Step.init
const isCrawlFinished = step === Step.finished
const isRunning = step === Step.running
const [crawlResult, setCrawlResult] = useState<{
current: number
total: number
data: CrawlResultItem[]
time_consuming: number | string
} | undefined>(undefined)
const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
const showError = isCrawlFinished && crawlErrorMessage
const waitForCrawlFinished = useCallback(async (jobId: string): Promise<any> => {
try {
const res = await checkWatercrawlTaskStatus(jobId) as any
if (res.status === 'completed') {
return {
isError: false,
data: {
...res,
total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
},
}
}
if (res.status === 'error' || !res.status) {
// can't get the error message from the watercrawl api
return {
isError: true,
errorMessage: res.message,
data: {
data: [],
},
}
}
// update the progress
setCrawlResult({
...res,
total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
})
onCheckedCrawlResultChange(res.data || []) // default select the crawl result
await sleep(2500)
return await waitForCrawlFinished(jobId)
}
catch (e: any) {
const errorBody = await e.json()
return {
isError: true,
errorMessage: errorBody.message,
data: {
data: [],
},
}
}
}, [crawlOptions.limit, onCheckedCrawlResultChange])
const handleRun = useCallback(async (value: Record<string, any>) => {
const { url, ...crawlOptions } = value
onCrawlOptionsChange(crawlOptions as CrawlOptions)
setStep(Step.running)
try {
const passToServerCrawlOptions: any = {
...crawlOptions,
}
if (crawlOptions.max_depth === '')
delete passToServerCrawlOptions.max_depth
const res = await createWatercrawlTask({
url,
options: passToServerCrawlOptions,
}) as any
const jobId = res.job_id
onJobIdChange(jobId)
const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
if (isError) {
setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
}
else {
setCrawlResult(data)
onCheckedCrawlResultChange(data.data || []) // default select the crawl result
setCrawlErrorMessage('')
}
}
catch (e) {
setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
console.log(e)
}
finally {
setStep(Step.finished)
}
}, [onCrawlOptionsChange, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished])
return (
<div>
<Header
isInPipeline
onClickConfiguration={handleSetting}
title={t(`${I18N_PREFIX}.watercrawlTitle`)}
buttonText={t(`${I18N_PREFIX}.configureWatercrawl`)}
docTitle={t(`${I18N_PREFIX}.watercrawlDoc`)}
docLink={'https://docs.watercrawl.dev/'}
/>
<div className='mt-2 rounded-xl border border-components-panel-border bg-background-default-subtle'>
<Options
initialData={{
...crawlOptions,
url: '',
}}
configurations={configurations}
isRunning={isRunning}
controlFoldOptions={controlFoldOptions}
schema={schema}
onSubmit={(value) => {
handleRun(value)
console.log('submit')
}}
/>
</div>
{!isInit && (
<div className='relative'>
{isRunning && (
<Crawling
crawledNum={crawlResult?.current || 0}
totalNum={crawlResult?.total || Number.parseFloat(crawlOptions.limit as string) || 0}
/>
)}
{showError && (
<ErrorMessage
className='mt-2'
title={t(`${I18N_PREFIX}.exceptionErrorTitle`)}
errorMsg={crawlErrorMessage}
/>
)}
{isCrawlFinished && !showError && (
<CrawledResult
className='mt-2'
list={crawlResult?.data || []}
checkedList={checkedCrawlResult}
onSelectedChange={onCheckedCrawlResultChange}
usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
/>
)}
</div>
)}
</div>
<Crawler
  nodeId={nodeId}
  variables={variables}
  checkedCrawlResult={checkedCrawlResult}
  // Must be waterCrawl: the provider selects the header title and doc link shown by Crawler.
  // The previous jinaReader value made this panel display the Jina Reader header.
  datasourceProvider={DataSourceProvider.waterCrawl}
  onCheckedCrawlResultChange={onCheckedCrawlResultChange}
  onJobIdChange={onJobIdChange}
/>
)
}
export default React.memo(WaterCrawl)

View File

@ -49,7 +49,7 @@ export const useConfigurations = (datasourceNodeId: string) => {
value: option,
})),
showConditions: [],
default: item.default,
default: item.default_value,
}))
return configs
}, [paramsConfig])

View File

@ -50,7 +50,7 @@ export const useDatasourceOptions = () => {
return {
nodeId: node.id,
type,
config: {},
variables: node.data.variables,
}
})
}, [nodes])
@ -98,3 +98,31 @@ export const useDatasourceOptions = () => {
}, [datasources, t])
return { datasources, options }
}
/**
 * Hook returning the header texts for every website crawler provider.
 *
 * The result is keyed by `DataSourceProvider` (fireCrawl, jinaReader,
 * waterCrawl). Each entry holds the translated header title, the translated
 * label of the documentation link, and the documentation URL itself.
 * Titles and labels are resolved through `t` under the
 * `datasetCreation.stepOne.website` i18n namespace; the URLs are fixed strings.
 *
 * @returns A record mapping each provider to its `{ title, docTitle, docLink }`.
 */
export const useWebCrawlerHeaderInfo = () => {
  const I18N_PREFIX = 'datasetCreation.stepOne.website'
  const { t } = useTranslation()

  // Build each provider's entry separately, then assemble the lookup table.
  const fireCrawlInfo = {
    title: t(`${I18N_PREFIX}.firecrawlTitle`),
    docTitle: t(`${I18N_PREFIX}.firecrawlDoc`),
    docLink: 'https://docs.firecrawl.dev/introduction',
  }
  const jinaReaderInfo = {
    title: t(`${I18N_PREFIX}.jinaReaderTitle`),
    docTitle: t(`${I18N_PREFIX}.jinaReaderDoc`),
    docLink: 'https://jina.ai/reader',
  }
  const waterCrawlInfo = {
    title: t(`${I18N_PREFIX}.watercrawlTitle`),
    docTitle: t(`${I18N_PREFIX}.watercrawlDoc`),
    docLink: 'https://docs.watercrawl.dev/',
  }

  // The Record annotation keeps the table exhaustive over DataSourceProvider.
  const headerInfoMap: Record<DataSourceProvider, {
    title: string
    docTitle: string
    docLink: string
  }> = {
    [DataSourceProvider.fireCrawl]: fireCrawlInfo,
    [DataSourceProvider.jinaReader]: jinaReaderInfo,
    [DataSourceProvider.waterCrawl]: waterCrawlInfo,
  }
  return headerInfoMap
}

View File

@ -4,7 +4,7 @@ import { useCallback, useMemo, useState } from 'react'
import StepIndicator from './step-indicator'
import { useTestRunSteps } from './hooks'
import DataSourceOptions from './data-source-options'
import type { CrawlOptions, CrawlResultItem, FileItem } from '@/models/datasets'
import type { CrawlResultItem, FileItem } from '@/models/datasets'
import { DataSourceType } from '@/models/datasets'
import LocalFile from './data-source/local-file'
import produce from 'immer'
@ -12,7 +12,6 @@ import { useProviderContextSelector } from '@/context/provider-context'
import { DataSourceProvider, type NotionPage } from '@/models/common'
import Notion from './data-source/notion'
import VectorSpaceFull from '@/app/components/billing/vector-space-full'
import { DEFAULT_CRAWL_OPTIONS } from './consts'
import Firecrawl from './data-source/website/firecrawl'
import JinaReader from './data-source/website/jina-reader'
import WaterCrawl from './data-source/website/water-crawl'
@ -31,7 +30,6 @@ const TestRunPanel = () => {
const [notionPages, setNotionPages] = useState<NotionPage[]>([])
const [websitePages, setWebsitePages] = useState<CrawlResultItem[]>([])
const [websiteCrawlJobId, setWebsiteCrawlJobId] = useState('')
const [crawlOptions, setCrawlOptions] = useState<CrawlOptions>(DEFAULT_CRAWL_OPTIONS)
const plan = useProviderContextSelector(state => state.plan)
const enableBilling = useProviderContextSelector(state => state.enableBilling)
@ -159,35 +157,36 @@ const TestRunPanel = () => {
)}
{datasource?.type === DataSourceType.NOTION && (
<Notion
nodeId={datasource?.nodeId || ''}
notionPages={notionPages}
updateNotionPages={updateNotionPages}
/>
)}
{datasource?.type === DataSourceProvider.fireCrawl && (
<Firecrawl
nodeId={datasource?.nodeId || ''}
variables={datasource?.variables}
checkedCrawlResult={websitePages}
onCheckedCrawlResultChange={setWebsitePages}
onJobIdChange={setWebsiteCrawlJobId}
crawlOptions={crawlOptions}
onCrawlOptionsChange={setCrawlOptions}
/>
)}
{datasource?.type === DataSourceProvider.jinaReader && (
<JinaReader
nodeId={datasource?.nodeId || ''}
variables={datasource?.variables}
checkedCrawlResult={websitePages}
onCheckedCrawlResultChange={setWebsitePages}
onJobIdChange={setWebsiteCrawlJobId}
crawlOptions={crawlOptions}
onCrawlOptionsChange={setCrawlOptions}
/>
)}
{datasource?.type === DataSourceProvider.waterCrawl && (
<WaterCrawl
nodeId={datasource?.nodeId || ''}
variables={datasource?.variables}
checkedCrawlResult={websitePages}
onCheckedCrawlResultChange={setWebsitePages}
onJobIdChange={setWebsiteCrawlJobId}
crawlOptions={crawlOptions}
onCrawlOptionsChange={setCrawlOptions}
/>
)}
{isShowVectorSpaceFull && (

View File

@ -1,5 +1,6 @@
import type { DataSourceProvider } from '@/models/common'
import type { DataSourceType } from '@/models/datasets'
import type { RAGPipelineVariables } from '@/models/pipeline'
export enum TestRunStep {
dataSource = 'dataSource',
@ -15,5 +16,5 @@ export type DataSourceOption = {
export type Datasource = {
nodeId: string
type: DataSourceType | DataSourceProvider
config: any
variables: RAGPipelineVariables
}

View File

@ -98,12 +98,12 @@ export type PipelineCheckDependenciesResponse = {
}
export enum PipelineInputVarType {
textInput = 'text-input',
textInput = 'textInput',
paragraph = 'paragraph',
select = 'select',
number = 'number',
number = 'numberInput',
singleFile = 'file',
multiFiles = 'file-list',
multiFiles = 'fileList',
checkbox = 'checkbox',
}
@ -142,23 +142,4 @@ export type PipelineDatasourceNodeRunRequest = {
inputs: Record<string, any>
}
export type PipelineDatasourceNodeRunResponse = {
id: string
inputs: Record<string, any>
process_data: Record<string, any>
outputs: Record<string, any>
status: string
error?: string
elapsed_time: number
execution_metadata: {
total_tokens: number
total_price: number
currency?: string
}
extras: {
icon: string | object
}
created_at: string
created_by: string
finished_at: string
}
export type PipelineDatasourceNodeRunResponse = Record<string, any>

View File

@ -9,6 +9,7 @@ import type {
ImportPipelineDSLResponse,
PipelineCheckDependenciesResponse,
PipelineDatasourceNodeRunRequest,
PipelineDatasourceNodeRunResponse,
PipelineProcessingParamsRequest,
PipelineProcessingParamsResponse,
PipelineTemplateByIdResponse,
@ -115,15 +116,18 @@ export const useCheckPipelineDependencies = (
})
}
export const useDatasourceNodeRun = () => {
export const useDatasourceNodeRun = (
mutationOptions: MutationOptions<PipelineDatasourceNodeRunResponse, Error, PipelineDatasourceNodeRunRequest> = {},
) => {
return useMutation({
mutationKey: [NAME_SPACE, 'datasource-node-run'],
mutationFn: (request: PipelineDatasourceNodeRunRequest) => {
const { pipeline_id, node_id, ...rest } = request
return post(`/rag/pipelines/${pipeline_id}/workflows/published/nodes/${node_id}/run`, {
return post<PipelineDatasourceNodeRunResponse>(`/rag/pipelines/${pipeline_id}/workflows/published/nodes/${node_id}/run`, {
body: rest,
})
},
...mutationOptions,
})
}