dify/web/app/components/base/markdown/markdown-utils.ts

/**
 * @fileoverview Utility functions for preprocessing Markdown content.
 * These functions were extracted from the main markdown renderer for better separation of concerns.
 * Includes preprocessing for LaTeX and custom "think" tags, plus a URL transform for react-markdown.
 */
import { flow } from 'lodash-es'

/**
 * Rewrites LaTeX bracket delimiters to dollar delimiters, leaving fenced code
 * blocks untouched.
 *
 * - `\[ ... \]` (single-line first, then multi-line) becomes `$$ ... $$`.
 * - `\( ... \)` becomes `$$ ... $$`.
 * - Text already written as `$ ... $` is left exactly as written.
 *
 * Fenced code blocks (``` ... ```) are swapped for a placeholder before the
 * rewrite and restored verbatim afterwards.
 *
 * @param content - Markdown source. Non-string values are returned unchanged.
 * @returns The markdown with LaTeX delimiters normalised.
 */
export const preprocessLaTeX = (content: string) => {
  if (typeof content !== 'string')
    return content

  const codeBlockRegex = /```[\s\S]*?```/g
  const codeBlocks = content.match(codeBlockRegex) || []

  const converted = content
    .replace(codeBlockRegex, 'CODE_BLOCK_PLACEHOLDER')
    // Single-line `\[...\]` is handled before the multi-line form so that a
    // complete one-line equation is never absorbed by an earlier unclosed `\[`.
    .replace(/\\\[(.*?)\\\]/g, (_, equation) => `$$${equation}$$`)
    .replace(/\\\[([\s\S]*?)\\\]/g, (_, equation) => `$$${equation}$$`)
    .replace(/\\\((.*?)\\\)/g, (_, equation) => `$$${equation}$$`)

  // Restore in one pass with a function replacer:
  // - a function's return value is inserted literally, so `$$`, `$&`, `$1`
  //   inside a code block are not treated as replacement patterns;
  // - restored text is never rescanned, so a code block that itself contains
  //   the placeholder text cannot swallow the next block.
  let restoredCount = 0
  return converted.replace(/CODE_BLOCK_PLACEHOLDER/g, (placeholder) => {
    const block = codeBlocks[restoredCount]
    // More placeholder-looking text than captured blocks: leave it as is.
    if (block === undefined)
      return placeholder
    restoredCount += 1
    return block
  })
}

/**
 * Rewrites `<think>` / `</think>` markers into a `<details data-think=true>`
 * element.
 *
 * - One or more consecutive `<think>` lines (LF or CRLF terminated) collapse
 *   into a single `<details data-think=true>` opening tag.
 * - `\n</think>` becomes `\n[ENDTHINKFLAG]</details>`; the `[ENDTHINKFLAG]`
 *   marker is presumably consumed by the renderer (verify against the caller).
 * - Any `</details>` that is followed by other text on the same line gets a
 *   newline appended so the following content starts on its own line.
 *
 * @param content - Markdown source. Non-string values are returned unchanged.
 * @returns The markdown with think tags converted.
 */
export const preprocessThinkTag = (content: string) => {
  // Same defensive guard as preprocessLaTeX: pass non-strings through instead
  // of throwing on `.replace`.
  if (typeof content !== 'string')
    return content

  // `\r?` keeps CRLF input balanced: the close-tag pattern below already
  // matches the `\n` of a `\r\n`, so the open tag must accept CRLF as well.
  const thinkOpenTagRegex = /(<think>\r?\n)+/g
  const thinkCloseTagRegex = /\n<\/think>/g
  return flow([
    (str: string) => str.replace(thinkOpenTagRegex, '<details data-think=true>\n'),
    (str: string) => str.replace(thinkCloseTagRegex, '\n[ENDTHINKFLAG]</details>'),
    // Skip when `</details>` is already followed by a line break or by the end
    // of the string (optionally after horizontal whitespace).
    (str: string) => str.replace(/(<\/details>)(?![^\S\r\n]*[\r\n])(?![^\S\r\n]*$)/g, '$1\n'),
  ])(content)
}

/**
 * URL filter intended for react-markdown v9+, whose default URL handling is
 * stricter than earlier versions.
 *
 * A URI is returned unchanged when any of the following holds:
 * - it starts with `#` (page-local fragment);
 * - it starts with `//` (protocol-relative URL);
 * - it contains no colon at all (plain relative path);
 * - its first colon comes after the first `/`, `?` or `#`, meaning the colon
 *   sits inside a path, query or fragment rather than terminating a scheme;
 * - the text up to and including the first colon is one of the permitted
 *   schemes, compared case-insensitively: `http:`, `https:`, `irc:`, `ircs:`,
 *   `mailto:`, `xmpp:`, or the custom `abbr:`.
 *
 * Anything else yields `undefined`, signalling that the URI is disallowed.
 *
 * @param uri - The URI taken from the markdown source.
 * @returns The original `uri` when allowed, otherwise `undefined`.
 */
export const customUrlTransform = (uri: string): string | undefined => {
  // Fragments and protocol-relative URLs never carry a scheme of their own.
  if (uri.startsWith('#') || uri.startsWith('//'))
    return uri

  const colonAt = uri.indexOf(':')
  if (colonAt === -1)
    return uri

  // If any of '/', '?' or '#' occurs before the first colon, the colon cannot
  // be a scheme separator, so the URI is treated as relative.
  const colonIsInsideRelativePart = ['/', '?', '#'].some((delimiter) => {
    const delimiterAt = uri.indexOf(delimiter)
    return delimiterAt !== -1 && delimiterAt < colonAt
  })
  if (colonIsInsideRelativePart)
    return uri

  const allowedSchemePattern = /^(https?|ircs?|mailto|xmpp|abbr):$/i
  const schemeWithColon = uri.slice(0, colonAt + 1).toLowerCase()
  return allowedSchemePattern.test(schemeWithColon) ? uri : undefined
}
Refactor/markdown component split (#20177) 2025-05-30 11:31:50 +08:00			`/**`
			`* @fileoverview Utility functions for preprocessing Markdown content.`
			`* These functions were extracted from the main markdown renderer for better separation of concerns.`
			`* Includes preprocessing for LaTeX and custom "think" tags.`
			`*/`
			`import { flow } from 'lodash-es'`

			`export const preprocessLaTeX = (content: string) => {`
			`if (typeof content !== 'string')`
			`return content`

			const codeBlockRegex = /```[\s\S]*?```/g
			`const codeBlocks = content.match(codeBlockRegex) \|\| []`
			`let processedContent = content.replace(codeBlockRegex, 'CODE_BLOCK_PLACEHOLDER')`

			`processedContent = flow([`
			(str: string) => str.replace(/\\\[(.*?)\\\]/g, (_, equation) => `$$${equation}$$`),
			(str: string) => str.replace(/\\\[([\s\S]*?)\\\]/g, (_, equation) => `$$${equation}$$`),
			(str: string) => str.replace(/\\\((.*?)\\\)/g, (_, equation) => `$$${equation}$$`),
			(str: string) => str.replace(/(^\|[^\\])\$(.+?)\$/g, (_, prefix, equation) => `${prefix}$${equation}$`),
			`])(processedContent)`

			`codeBlocks.forEach((block) => {`
			`processedContent = processedContent.replace('CODE_BLOCK_PLACEHOLDER', block)`
			`})`

			`return processedContent`
			`}`

			`export const preprocessThinkTag = (content: string) => {`
fix Multiple <think>\n Interface rendering exception (#20977) 2025-06-18 11:31:04 +08:00			`const thinkOpenTagRegex = /(<think>\n)+/g`
Refactor/markdown component split (#20177) 2025-05-30 11:31:50 +08:00			`const thinkCloseTagRegex = /\n<\/think>/g`
			`return flow([`
			`(str: string) => str.replace(thinkOpenTagRegex, '<details data-think=true>\n'),`
			`(str: string) => str.replace(thinkCloseTagRegex, '\n[ENDTHINKFLAG]</details>'),`
fix: ensure newlines around think tags for proper markdown rendering (#20594) 2025-06-03 18:56:09 +08:00			`(str: string) => str.replace(/(<\/details>)(?![^\S\r\n][\r\n])(?![^\S\r\n]$)/g, '$1\n'),`
Refactor/markdown component split (#20177) 2025-05-30 11:31:50 +08:00			`])(content)`
			`}`
fix(markdown): Ensure abbr: links render correctly in react-markdown v9+ (#20648) 2025-06-04 19:52:12 +08:00
			`/**`
			`* Transforms a URI for use in react-markdown, ensuring security and compatibility.`
			`* This function is designed to work with react-markdown v9+ which has stricter`
			`* default URL handling.`
			`*`
			`* Behavior:`
			`* 1. Always allows the custom 'abbr:' protocol.`
			`* 2. Always allows page-local fragments (e.g., "#some-id").`
			`* 3. Always allows protocol-relative URLs (e.g., "//example.com/path").`
			`* 4. Always allows purely relative paths (e.g., "path/to/file", "/abs/path").`
			`* 5. Allows absolute URLs if their scheme is in a permitted list (case-insensitive):`
			`* 'http:', 'https:', 'mailto:', 'xmpp:', 'irc:', 'ircs:'.`
			`* 6. Intelligently distinguishes colons used for schemes from colons within`
			`* paths, query parameters, or fragments of relative-like URLs.`
			* 7. Returns the original URI if allowed, otherwise returns `undefined` to
			`* signal that the URI should be removed/disallowed by react-markdown.`
			`*/`
			`export const customUrlTransform = (uri: string): string \| undefined => {`
			`const PERMITTED_SCHEME_REGEX = /^(https?\|ircs?\|mailto\|xmpp\|abbr):$/i`

			`if (uri.startsWith('#'))`
			`return uri`

			`if (uri.startsWith('//'))`
			`return uri`

			`const colonIndex = uri.indexOf(':')`

			`if (colonIndex === -1)`
			`return uri`

			`const slashIndex = uri.indexOf('/')`
			`const questionMarkIndex = uri.indexOf('?')`
			`const hashIndex = uri.indexOf('#')`

			`if (`
			`(slashIndex !== -1 && colonIndex > slashIndex)`
			`\|\| (questionMarkIndex !== -1 && colonIndex > questionMarkIndex)`
			`\|\| (hashIndex !== -1 && colonIndex > hashIndex)`
			`)`
			`return uri`

			`const scheme = uri.substring(0, colonIndex + 1).toLowerCase()`
			`if (PERMITTED_SCHEME_REGEX.test(scheme))`
			`return uri`

			`return undefined`
			`}`