feat: Add JinaReader and WaterCrawl components with configurations and schema handling

This commit is contained in:
twwu 2025-04-28 14:33:01 +08:00
parent f71b0eccb2
commit 8f07e088f5
9 changed files with 621 additions and 17 deletions

View File

@ -23,7 +23,7 @@ const Header = ({
}: HeaderProps) => {
return (
<div className='flex items-center gap-x-2'>
<div className='flex grow items-center gap-x-1'>
<div className='flex shrink-0 grow items-center gap-x-1'>
<div className={cn(
'text-text-secondary',
isInPipeline ? 'system-sm-semibold' : 'system-md-semibold',
@ -48,13 +48,13 @@ const Header = ({
</Button>
</div>
<a
className='system-xs-medium flex shrink-0 items-center gap-x-1 text-text-accent'
className='system-xs-medium flex items-center gap-x-1 overflow-hidden text-text-accent'
href={docLink}
target='_blank'
rel='noopener noreferrer'
>
<RiBookOpenLine className='size-3.5' />
<span>{docTitle}</span>
<RiBookOpenLine className='size-3.5 shrink-0' />
<span className='grow truncate' title={docTitle}>{docTitle}</span>
</a>
</div>
)

View File

@ -9,8 +9,8 @@ import { RiPlayLargeLine } from '@remixicon/react'
import { useBoolean } from 'ahooks'
import { useEffect } from 'react'
import { useTranslation } from 'react-i18next'
import { useSchema } from './hooks'
import Toast from '@/app/components/base/toast'
import type { ZodSchema } from 'zod'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
@ -23,6 +23,7 @@ type OptionsProps = {
configurations: BaseConfiguration<FormData>[]
isRunning: boolean
controlFoldOptions?: number
schema: ZodSchema
onSubmit: (data: FormData) => void
}
@ -31,9 +32,9 @@ const Options = ({
configurations,
isRunning,
controlFoldOptions,
schema,
onSubmit,
}: OptionsProps) => {
const schema = useSchema()
const form = useAppForm({
defaultValues: initialData,
validators: {

View File

@ -1,7 +1,7 @@
import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types'
import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types'
import { useTranslation } from 'react-i18next'
import type { FormData } from './options'
import type { FormData } from '../base/options'
import { z } from 'zod'
const ERROR_I18N_PREFIX = 'common.errorMsg'
@ -22,7 +22,7 @@ export const useConfigurations = () => {
type: BaseFieldType.numberInput,
variable: 'limit',
label: t(`${I18N_PREFIX}.limit`),
required: false,
required: true,
showConditions: [],
},
{
@ -31,6 +31,7 @@ export const useConfigurations = () => {
label: t(`${I18N_PREFIX}.maxDepth`),
required: false,
showConditions: [],
tooltip: t(`${I18N_PREFIX}.maxDepthTooltip`),
},
{
type: BaseFieldType.textInput,

View File

@ -1,5 +1,4 @@
'use client'
import type { FC } from 'react'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { useModalContextSelector } from '@/context/modal-context'
@ -7,16 +6,16 @@ import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import { checkFirecrawlTaskStatus, createFirecrawlTask } from '@/service/datasets'
import { sleep } from '@/utils'
import Header from '@/app/components/datasets/create/website/base/header'
import type { FormData } from './options'
import Options from './options'
import { useConfigurations } from './hooks'
import type { FormData } from '../base/options'
import Options from '../base/options'
import { useConfigurations, useSchema } from './hooks'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import CrawledResult from '../base/crawled-result'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
type Props = {
type FireCrawlProps = {
checkedCrawlResult: CrawlResultItem[]
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
onJobIdChange: (jobId: string) => void
@ -30,17 +29,19 @@ enum Step {
finished = 'finished',
}
const FireCrawl: FC<Props> = ({
const FireCrawl = ({
checkedCrawlResult,
onCheckedCrawlResultChange,
onJobIdChange,
crawlOptions,
onCrawlOptionsChange,
}) => {
}: FireCrawlProps) => {
const { t } = useTranslation()
const [step, setStep] = useState<Step>(Step.init)
const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
const configurations = useConfigurations()
const schema = useSchema()
useEffect(() => {
if (step !== Step.init)
setControlFoldOptions(Date.now())
@ -163,6 +164,7 @@ const FireCrawl: FC<Props> = ({
configurations={configurations}
isRunning={isRunning}
controlFoldOptions={controlFoldOptions}
schema={schema}
onSubmit={(value) => {
handleRun(value)
console.log('submit')

View File

@ -0,0 +1,67 @@
import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types'
import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types'
import { useTranslation } from 'react-i18next'
import type { FormData } from '../base/options'
import { z } from 'zod'
const ERROR_I18N_PREFIX = 'common.errorMsg'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
/**
 * Builds the field descriptors rendered by the Jina Reader options form.
 *
 * The fields are, in display order: `url`, `limit`, `crawl_sub_pages`
 * and `use_sitemap`. Labels and tooltips are resolved through the
 * translation function, so the list is rebuilt on every call.
 *
 * @returns The ordered list of form field configurations.
 */
export const useConfigurations = () => {
  const { t } = useTranslation()

  // Target address of the crawl; the only free-text field.
  const urlField: BaseConfiguration<FormData> = {
    type: BaseFieldType.textInput,
    variable: 'url',
    label: 'URL',
    required: true,
    showConditions: [],
    placeholder: 'https://docs.dify.ai',
  }

  const limitField: BaseConfiguration<FormData> = {
    type: BaseFieldType.numberInput,
    variable: 'limit',
    label: t(`${I18N_PREFIX}.limit`),
    required: true,
    showConditions: [],
  }

  const crawlSubPagesField: BaseConfiguration<FormData> = {
    type: BaseFieldType.checkbox,
    variable: 'crawl_sub_pages',
    label: t(`${I18N_PREFIX}.crawlSubPage`),
    required: false,
    showConditions: [],
  }

  const useSitemapField: BaseConfiguration<FormData> = {
    type: BaseFieldType.checkbox,
    variable: 'use_sitemap',
    label: t(`${I18N_PREFIX}.useSitemap`),
    tooltip: t(`${I18N_PREFIX}.useSitemapTooltip`),
    required: false,
    showConditions: [],
  }

  return [
    urlField,
    limitField,
    crawlSubPagesField,
    useSitemapField,
  ]
}
/**
 * Builds the zod schema used to validate the Jina Reader options form.
 *
 * - `url` must be a non-empty string starting with `http://` or `https://`.
 * - `limit` must be a positive integer.
 *
 * Any other submitted keys are passed through unvalidated.
 *
 * @returns The zod object schema with localized error messages.
 */
export const useSchema = () => {
  const { t } = useTranslation()

  // Resolve the localized messages up front so the schema reads declaratively.
  const urlRequiredMessage = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
    field: 'url',
  })
  const urlFormatMessage = t(`${ERROR_I18N_PREFIX}.urlError`)
  const limitMessage = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
    field: t(`${I18N_PREFIX}.limit`),
  })

  const urlField = z.string()
    .nonempty({ message: urlRequiredMessage })
    .regex(/^https?:\/\//, { message: urlFormatMessage })

  const limitField = z.number()
    .positive({ message: limitMessage })
    .int()

  return z.object({
    url: urlField,
    limit: limitField,
  }).passthrough()
}

View File

@ -0,0 +1,216 @@
'use client'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import CrawledResult from '../base/crawled-result'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import { useModalContextSelector } from '@/context/modal-context'
import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets'
import { sleep } from '@/utils'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import Header from '@/app/components/datasets/create/website/base/header'
import type { FormData } from '../base/options'
import Options from '../base/options'
import { useConfigurations, useSchema } from './hooks'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
/** Props accepted by the `JinaReader` data-source panel. */
type JinaReaderProps = {
  /** Crawl results currently checked; forwarded to the result list as its checked list. */
  checkedCrawlResult: CrawlResultItem[]
  /** Called with the new checked list, both on user selection and when results arrive. */
  onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
  /** Called with the job id when task creation returns one. */
  onJobIdChange: (jobId: string) => void
  /** Current crawl options; used to seed the options form. */
  crawlOptions: CrawlOptions
  /** Called on submit with the form values minus `url`. */
  onCrawlOptionsChange: (payload: CrawlOptions) => void
}
/** Lifecycle of a single crawl run inside the panel. */
enum Step {
  /** Nothing has been run yet; the result area is hidden. */
  init = 'init',
  /** A crawl request is in flight. */
  running = 'running',
  /** The last run has settled, successfully or with an error. */
  finished = 'finished',
}
/**
 * Website data-source panel backed by Jina Reader.
 *
 * Renders the header, the options form and, once a run has been started,
 * either a progress indicator, an error message or the crawled result list.
 *
 * A run either returns the page content directly (`res.data`) or a job id
 * (`res.job_id`) that is then polled until the job completes or fails.
 */
const JinaReader = ({
  checkedCrawlResult,
  onCheckedCrawlResultChange,
  onJobIdChange,
  crawlOptions,
  onCrawlOptionsChange,
}: JinaReaderProps) => {
  const { t } = useTranslation()
  const [step, setStep] = useState<Step>(Step.init)
  const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
  const configurations = useConfigurations()
  const schema = useSchema()

  // Hand <Options> a fresh timestamp each time the step changes to a
  // non-init value (presumably used there to fold the form — confirm in Options).
  useEffect(() => {
    if (step !== Step.init)
      setControlFoldOptions(Date.now())
  }, [step])

  const setShowAccountSettingModal = useModalContextSelector(state => state.setShowAccountSettingModal)
  const handleSetting = useCallback(() => {
    setShowAccountSettingModal({
      payload: 'data-source',
    })
  }, [setShowAccountSettingModal])

  const isInit = step === Step.init
  const isCrawlFinished = step === Step.finished
  const isRunning = step === Step.running
  const [crawlResult, setCrawlResult] = useState<{
    current: number
    total: number
    data: CrawlResultItem[]
    time_consuming: number | string
  } | undefined>(undefined)
  const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
  const showError = isCrawlFinished && crawlErrorMessage

  /**
   * Polls the task status every 2.5s until it is completed or failed.
   *
   * @param jobId - Id of the crawl job to poll.
   * @param limit - Page limit of the submitted run, used to cap the reported total.
   *   It is passed in explicitly because the `crawlOptions` prop captured by this
   *   callback still holds the values from before the submit.
   * @returns `{ isError, data, errorMessage? }` describing the final state.
   */
  const waitForCrawlFinished = useCallback(async (jobId: string, limit: number): Promise<any> => {
    try {
      const res = await checkJinaReaderTaskStatus(jobId) as any
      if (res.status === 'completed') {
        return {
          isError: false,
          data: {
            ...res,
            total: Math.min(res.total, limit),
          },
        }
      }
      if (res.status === 'failed' || !res.status) {
        return {
          isError: true,
          errorMessage: res.message,
          data: {
            data: [],
          },
        }
      }
      // update the progress
      setCrawlResult({
        ...res,
        total: Math.min(res.total, limit),
      })
      onCheckedCrawlResultChange(res.data || []) // default select the crawl result
      await sleep(2500)
      return await waitForCrawlFinished(jobId, limit)
    }
    catch (e: any) {
      // The rejection is expected to expose `json()`; anything else (or a
      // body that is not JSON) must not throw a second error from here, so
      // fall back to an undefined message and let the caller pick a default.
      let errorMessage: string | undefined
      try {
        errorMessage = (await e.json()).message
      }
      catch {
        errorMessage = undefined
      }
      return {
        isError: true,
        errorMessage,
        data: {
          data: [],
        },
      }
    }
  }, [onCheckedCrawlResultChange])

  /**
   * Starts a crawl with the submitted form values and stores its outcome.
   *
   * @param value - Validated form values; `url` is split off and the rest
   *   is reported upward as the new crawl options.
   */
  const handleRun = useCallback(async (value: FormData) => {
    const { url, ...crawlOptions } = value
    onCrawlOptionsChange(crawlOptions)
    setStep(Step.running)
    try {
      const startTime = Date.now()
      const res = await createJinaReaderTask({
        url,
        options: crawlOptions,
      }) as any

      if (res.data) {
        // Single-page response: the content comes back with the create call.
        const data = {
          current: 1,
          total: 1,
          data: [{
            title: res.data.title,
            markdown: res.data.content,
            description: res.data.description,
            source_url: res.data.url,
          }],
          time_consuming: (Date.now() - startTime) / 1000,
        }
        setCrawlResult(data)
        onCheckedCrawlResultChange(data.data || [])
        setCrawlErrorMessage('')
      }
      else if (res.job_id) {
        const jobId = res.job_id
        onJobIdChange(jobId)
        const { isError, data, errorMessage } = await waitForCrawlFinished(
          jobId,
          Number.parseFloat(String(crawlOptions.limit)),
        )
        if (isError) {
          setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
        }
        else {
          setCrawlResult(data)
          onCheckedCrawlResultChange(data.data || []) // default select the crawl result
          setCrawlErrorMessage('')
        }
      }
    }
    catch (e) {
      setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
      console.log(e)
    }
    finally {
      setStep(Step.finished)
    }
  }, [onCrawlOptionsChange, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished])

  return (
    <div>
      <Header
        isInPipeline
        onClickConfiguration={handleSetting}
        title={t(`${I18N_PREFIX}.jinaReaderTitle`)}
        buttonText={t(`${I18N_PREFIX}.configureJinaReader`)}
        docTitle={t(`${I18N_PREFIX}.jinaReaderDoc`)}
        docLink={'https://jina.ai/reader'}
      />
      <div className='mt-2 rounded-xl border border-components-panel-border bg-background-default-subtle'>
        <Options
          initialData={{
            ...crawlOptions,
            url: '',
          }}
          configurations={configurations}
          isRunning={isRunning}
          controlFoldOptions={controlFoldOptions}
          schema={schema}
          onSubmit={(value) => {
            handleRun(value)
          }}
        />
      </div>
      {!isInit && (
        <div className='relative'>
          {isRunning && (
            <Crawling
              crawledNum={crawlResult?.current || 0}
              totalNum={crawlResult?.total || Number.parseFloat(crawlOptions.limit as string) || 0}
            />
          )}
          {showError && (
            <ErrorMessage
              className='mt-2'
              title={t(`${I18N_PREFIX}.exceptionErrorTitle`)}
              errorMsg={crawlErrorMessage}
            />
          )}
          {isCrawlFinished && !showError && (
            <CrawledResult
              className='mt-2'
              list={crawlResult?.data || []}
              checkedList={checkedCrawlResult}
              onSelectedChange={onCheckedCrawlResultChange}
              usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
            />
          )}
        </div>
      )}
    </div>
  )
}
export default React.memo(JinaReader)

View File

@ -0,0 +1,90 @@
import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types'
import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types'
import { useTranslation } from 'react-i18next'
import type { FormData } from '../base/options'
import { z } from 'zod'
const ERROR_I18N_PREFIX = 'common.errorMsg'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
/**
 * Builds the field descriptors rendered by the WaterCrawl options form.
 *
 * The fields are, in display order: `url`, `limit`, `max_depth`,
 * `excludes`, `includes`, `crawl_sub_pages` and `only_main_content`.
 * Labels and tooltips are resolved through the translation function,
 * so the list is rebuilt on every call.
 *
 * @returns The ordered list of form field configurations.
 */
export const useConfigurations = () => {
  const { t } = useTranslation()

  // Target address of the crawl.
  const urlField: BaseConfiguration<FormData> = {
    type: BaseFieldType.textInput,
    variable: 'url',
    label: 'URL',
    required: true,
    showConditions: [],
    placeholder: 'https://docs.dify.ai',
  }

  const limitField: BaseConfiguration<FormData> = {
    type: BaseFieldType.numberInput,
    variable: 'limit',
    label: t(`${I18N_PREFIX}.limit`),
    required: true,
    showConditions: [],
  }

  const maxDepthField: BaseConfiguration<FormData> = {
    type: BaseFieldType.numberInput,
    variable: 'max_depth',
    label: t(`${I18N_PREFIX}.maxDepth`),
    required: false,
    showConditions: [],
    tooltip: t(`${I18N_PREFIX}.maxDepthTooltip`),
  }

  const excludesField: BaseConfiguration<FormData> = {
    type: BaseFieldType.textInput,
    variable: 'excludes',
    label: t(`${I18N_PREFIX}.excludePaths`),
    required: false,
    showConditions: [],
    placeholder: 'blog/*, /about/*',
  }

  const includesField: BaseConfiguration<FormData> = {
    type: BaseFieldType.textInput,
    variable: 'includes',
    label: t(`${I18N_PREFIX}.includeOnlyPaths`),
    required: false,
    showConditions: [],
    placeholder: 'articles/*',
  }

  const crawlSubPagesField: BaseConfiguration<FormData> = {
    type: BaseFieldType.checkbox,
    variable: 'crawl_sub_pages',
    label: t(`${I18N_PREFIX}.crawlSubPage`),
    required: false,
    showConditions: [],
  }

  const onlyMainContentField: BaseConfiguration<FormData> = {
    type: BaseFieldType.checkbox,
    variable: 'only_main_content',
    label: t(`${I18N_PREFIX}.extractOnlyMainContent`),
    required: false,
    showConditions: [],
  }

  return [
    urlField,
    limitField,
    maxDepthField,
    excludesField,
    includesField,
    crawlSubPagesField,
    onlyMainContentField,
  ]
}
/**
 * Builds the zod schema used to validate the WaterCrawl options form.
 *
 * - `url` must be a non-empty string starting with `http://` or `https://`.
 * - `limit` must be a positive integer.
 *
 * Any other submitted keys are passed through unvalidated.
 *
 * @returns The zod object schema with localized error messages.
 */
export const useSchema = () => {
  const { t } = useTranslation()

  // Resolve the localized messages up front so the schema reads declaratively.
  const urlRequiredMessage = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
    field: 'url',
  })
  const urlFormatMessage = t(`${ERROR_I18N_PREFIX}.urlError`)
  const limitMessage = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
    field: t(`${I18N_PREFIX}.limit`),
  })

  const urlField = z.string()
    .nonempty({ message: urlRequiredMessage })
    .regex(/^https?:\/\//, { message: urlFormatMessage })

  const limitField = z.number()
    .positive({ message: limitMessage })
    .int()

  return z.object({
    url: urlField,
    limit: limitField,
  }).passthrough()
}

View File

@ -0,0 +1,203 @@
'use client'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { useModalContextSelector } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import { checkWatercrawlTaskStatus, createWatercrawlTask } from '@/service/datasets'
import { sleep } from '@/utils'
import Header from '@/app/components/datasets/create/website/base/header'
import type { FormData } from '../base/options'
import Options from '../base/options'
import { useConfigurations, useSchema } from './hooks'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import CrawledResult from '../base/crawled-result'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
/** Props accepted by the `WaterCrawl` data-source panel. */
type WaterCrawlProps = {
  /** Crawl results currently checked; forwarded to the result list as its checked list. */
  checkedCrawlResult: CrawlResultItem[]
  /** Called with the new checked list, both on user selection and when results arrive. */
  onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
  /** Called with the job id returned by task creation. */
  onJobIdChange: (jobId: string) => void
  /** Current crawl options; used to seed the options form. */
  crawlOptions: CrawlOptions
  /** Called on submit with the form values minus `url`. */
  onCrawlOptionsChange: (payload: CrawlOptions) => void
}
/** Lifecycle of a single crawl run inside the panel. */
enum Step {
  /** Nothing has been run yet; the result area is hidden. */
  init = 'init',
  /** A crawl request is in flight. */
  running = 'running',
  /** The last run has settled, successfully or with an error. */
  finished = 'finished',
}
/**
 * Website data-source panel backed by WaterCrawl.
 *
 * Renders the header, the options form and, once a run has been started,
 * either a progress indicator, an error message or the crawled result list.
 *
 * A run creates a crawl task, reports its job id upward and polls the task
 * status until it completes or errors.
 */
const WaterCrawl = ({
  checkedCrawlResult,
  onCheckedCrawlResultChange,
  onJobIdChange,
  crawlOptions,
  onCrawlOptionsChange,
}: WaterCrawlProps) => {
  const { t } = useTranslation()
  const [step, setStep] = useState<Step>(Step.init)
  const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
  const configurations = useConfigurations()
  const schema = useSchema()

  // Hand <Options> a fresh timestamp each time the step changes to a
  // non-init value (presumably used there to fold the form — confirm in Options).
  useEffect(() => {
    if (step !== Step.init)
      setControlFoldOptions(Date.now())
  }, [step])

  const setShowAccountSettingModal = useModalContextSelector(state => state.setShowAccountSettingModal)
  const handleSetting = useCallback(() => {
    setShowAccountSettingModal({
      payload: 'data-source',
    })
  }, [setShowAccountSettingModal])

  const isInit = step === Step.init
  const isCrawlFinished = step === Step.finished
  const isRunning = step === Step.running
  const [crawlResult, setCrawlResult] = useState<{
    current: number
    total: number
    data: CrawlResultItem[]
    time_consuming: number | string
  } | undefined>(undefined)
  const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
  const showError = isCrawlFinished && crawlErrorMessage

  /**
   * Polls the task status every 2.5s until it is completed or errored.
   *
   * @param jobId - Id of the crawl job to poll.
   * @param limit - Page limit of the submitted run, used to cap the reported total.
   *   It is passed in explicitly because the `crawlOptions` prop captured by this
   *   callback still holds the values from before the submit.
   * @returns `{ isError, data, errorMessage? }` describing the final state.
   */
  const waitForCrawlFinished = useCallback(async (jobId: string, limit: number): Promise<any> => {
    try {
      const res = await checkWatercrawlTaskStatus(jobId) as any
      if (res.status === 'completed') {
        return {
          isError: false,
          data: {
            ...res,
            total: Math.min(res.total, limit),
          },
        }
      }
      if (res.status === 'error' || !res.status) {
        // can't get the error message from the watercrawl api
        return {
          isError: true,
          errorMessage: res.message,
          data: {
            data: [],
          },
        }
      }
      // update the progress
      setCrawlResult({
        ...res,
        total: Math.min(res.total, limit),
      })
      onCheckedCrawlResultChange(res.data || []) // default select the crawl result
      await sleep(2500)
      return await waitForCrawlFinished(jobId, limit)
    }
    catch (e: any) {
      // The rejection is expected to expose `json()`; anything else (or a
      // body that is not JSON) must not throw a second error from here, so
      // fall back to an undefined message and let the caller pick a default.
      let errorMessage: string | undefined
      try {
        errorMessage = (await e.json()).message
      }
      catch {
        errorMessage = undefined
      }
      return {
        isError: true,
        errorMessage,
        data: {
          data: [],
        },
      }
    }
  }, [onCheckedCrawlResultChange])

  /**
   * Starts a crawl with the submitted form values and stores its outcome.
   *
   * @param value - Validated form values; `url` is split off and the rest
   *   is reported upward as the new crawl options.
   */
  const handleRun = useCallback(async (value: FormData) => {
    const { url, ...crawlOptions } = value
    onCrawlOptionsChange(crawlOptions)
    setStep(Step.running)
    try {
      const passToServerCrawlOptions: any = {
        ...crawlOptions,
      }
      // An empty max depth is dropped from the request rather than sent as ''.
      if (crawlOptions.max_depth === '')
        delete passToServerCrawlOptions.max_depth

      const res = await createWatercrawlTask({
        url,
        options: passToServerCrawlOptions,
      }) as any
      const jobId = res.job_id
      onJobIdChange(jobId)
      const { isError, data, errorMessage } = await waitForCrawlFinished(
        jobId,
        Number.parseFloat(String(crawlOptions.limit)),
      )
      if (isError) {
        setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
      }
      else {
        setCrawlResult(data)
        onCheckedCrawlResultChange(data.data || []) // default select the crawl result
        setCrawlErrorMessage('')
      }
    }
    catch (e) {
      setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
      console.log(e)
    }
    finally {
      setStep(Step.finished)
    }
  }, [onCrawlOptionsChange, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished])

  return (
    <div>
      <Header
        isInPipeline
        onClickConfiguration={handleSetting}
        title={t(`${I18N_PREFIX}.watercrawlTitle`)}
        buttonText={t(`${I18N_PREFIX}.configureWatercrawl`)}
        docTitle={t(`${I18N_PREFIX}.watercrawlDoc`)}
        docLink={'https://docs.watercrawl.dev/'}
      />
      <div className='mt-2 rounded-xl border border-components-panel-border bg-background-default-subtle'>
        <Options
          initialData={{
            ...crawlOptions,
            url: '',
          }}
          configurations={configurations}
          isRunning={isRunning}
          controlFoldOptions={controlFoldOptions}
          schema={schema}
          onSubmit={(value) => {
            handleRun(value)
          }}
        />
      </div>
      {!isInit && (
        <div className='relative'>
          {isRunning && (
            <Crawling
              crawledNum={crawlResult?.current || 0}
              totalNum={crawlResult?.total || Number.parseFloat(crawlOptions.limit as string) || 0}
            />
          )}
          {showError && (
            <ErrorMessage
              className='mt-2'
              title={t(`${I18N_PREFIX}.exceptionErrorTitle`)}
              errorMsg={crawlErrorMessage}
            />
          )}
          {isCrawlFinished && !showError && (
            <CrawledResult
              className='mt-2'
              list={crawlResult?.data || []}
              checkedList={checkedCrawlResult}
              onSelectedChange={onCheckedCrawlResultChange}
              usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
            />
          )}
        </div>
      )}
    </div>
  )
}
export default React.memo(WaterCrawl)

View File

@ -16,11 +16,13 @@ import Notion from './data-source/notion'
import VectorSpaceFull from '@/app/components/billing/vector-space-full'
import { DEFAULT_CRAWL_OPTIONS } from './consts'
import Firecrawl from './data-source/website/firecrawl'
import JinaReader from './data-source/website/jina-reader'
import WaterCrawl from './data-source/website/water-crawl'
const TestRunPanel = () => {
const { t } = useTranslation()
const [currentStep, setCurrentStep] = useState(1)
const [dataSource, setDataSource] = useState<string>(DataSourceProvider.fireCrawl)
const [dataSource, setDataSource] = useState<string>(DataSourceProvider.waterCrawl)
const [fileList, setFiles] = useState<FileItem[]>([])
const [notionPages, setNotionPages] = useState<NotionPage[]>([])
const [websitePages, setWebsitePages] = useState<CrawlResultItem[]>([])
@ -51,8 +53,12 @@ const TestRunPanel = () => {
return nextDisabled
if (dataSource === DataSourceType.NOTION)
return isShowVectorSpaceFull || !notionPages.length
if (dataSource === DataSourceProvider.fireCrawl
|| dataSource === DataSourceProvider.jinaReader
|| dataSource === DataSourceProvider.waterCrawl)
return isShowVectorSpaceFull || !websitePages.length
return false
}, [dataSource, nextDisabled, isShowVectorSpaceFull, notionPages.length])
}, [dataSource, nextDisabled, isShowVectorSpaceFull, notionPages.length, websitePages.length])
const handleClose = () => {
setShowTestRunPanel?.(false)
@ -135,6 +141,24 @@ const TestRunPanel = () => {
onCrawlOptionsChange={setCrawlOptions}
/>
)}
{dataSource === DataSourceProvider.jinaReader && (
<JinaReader
checkedCrawlResult={websitePages}
onCheckedCrawlResultChange={setWebsitePages}
onJobIdChange={setWebsiteCrawlJobId}
crawlOptions={crawlOptions}
onCrawlOptionsChange={setCrawlOptions}
/>
)}
{dataSource === DataSourceProvider.waterCrawl && (
<WaterCrawl
checkedCrawlResult={websitePages}
onCheckedCrawlResultChange={setWebsitePages}
onJobIdChange={setWebsiteCrawlJobId}
crawlOptions={crawlOptions}
onCrawlOptionsChange={setCrawlOptions}
/>
)}
{isShowVectorSpaceFull && (
<VectorSpaceFull />
)}