2024-04-17 11:04:03 -07:00
import { WebScraperDataProvider } from "../index" ;
2024-04-17 14:54:54 -03:00
2024-04-17 11:04:03 -07:00
/**
 * Test suite for WebScraperDataProvider#replaceImgPathsWithAbsolutePaths.
 *
 * Every case follows the same pattern: construct a provider, hand it an
 * array of documents, and deep-compare the returned value against a
 * separately built array of expected documents.
 */
describe("WebScraperDataProvider", () => {
  /**
   * Builds a fresh document fixture.
   *
   * A new object is returned on every call so that input and expected
   * fixtures never share references with each other.
   *
   * @param {string} sourceURL - Value stored under `metadata.sourceURL`.
   * @param {string} content - Value stored under `content`.
   * @returns {{ metadata: { sourceURL: string }, content: string }}
   */
  const makeDoc = (sourceURL, content) => ({
    metadata: { sourceURL },
    content,
  });

  describe("replaceImgPathsWithAbsolutePaths", () => {
    it("should replace image paths with absolute paths", () => {
      const provider = new WebScraperDataProvider();

      // Four documents; the second and third share the same source URL.
      const inputDocs = [
        makeDoc("https://example.com/page", ""),
        makeDoc("https://example.com/another-page", ""),
        makeDoc("https://example.com/another-page", ""),
        makeDoc("https://example.com/data-image", ""),
      ];
      const expectedDocs = [
        makeDoc("https://example.com/page", ""),
        makeDoc("https://example.com/another-page", ""),
        makeDoc("https://example.com/another-page", ""),
        makeDoc("https://example.com/data-image", ""),
      ];

      const actualDocs = provider.replaceImgPathsWithAbsolutePaths(inputDocs);

      expect(actualDocs).toEqual(expectedDocs);
    });

    it("should handle absolute URLs without modification", () => {
      const provider = new WebScraperDataProvider();

      const inputDocs = [
        makeDoc("https://example.com/page", ""),
        makeDoc("https://example.com/another-page", ""),
      ];
      const expectedDocs = [
        makeDoc("https://example.com/page", ""),
        makeDoc("https://example.com/another-page", ""),
      ];

      const actualDocs = provider.replaceImgPathsWithAbsolutePaths(inputDocs);

      expect(actualDocs).toEqual(expectedDocs);
    });

    it("should not replace non-image content within the documents", () => {
      const provider = new WebScraperDataProvider();

      // Content holds a markdown link and bold text; the expected content
      // is the same text as the input.
      const inputDocs = [
        makeDoc(
          "https://example.com/page",
          "This is a test.  Here is a link: [Example](https://example.com).",
        ),
        makeDoc(
          "https://example.com/another-page",
          "Another test.  Here is some **bold text**.",
        ),
      ];
      const expectedDocs = [
        makeDoc(
          "https://example.com/page",
          "This is a test.  Here is a link: [Example](https://example.com).",
        ),
        makeDoc(
          "https://example.com/another-page",
          "Another test.  Here is some **bold text**.",
        ),
      ];

      const actualDocs = provider.replaceImgPathsWithAbsolutePaths(inputDocs);

      expect(actualDocs).toEqual(expectedDocs);
    });

    it("should replace multiple image paths within the documents", () => {
      const provider = new WebScraperDataProvider();

      // Note the trailing space at the end of each content string.
      const inputDocs = [
        makeDoc(
          "https://example.com/page",
          "This is a test.  Here is a link: [Example](https://example.com). ",
        ),
        makeDoc(
          "https://example.com/another-page",
          "Another test.  Here is some **bold text**. ",
        ),
      ];
      const expectedDocs = [
        makeDoc(
          "https://example.com/page",
          "This is a test.  Here is a link: [Example](https://example.com). ",
        ),
        makeDoc(
          "https://example.com/another-page",
          "Another test.  Here is some **bold text**. ",
        ),
      ];

      const actualDocs = provider.replaceImgPathsWithAbsolutePaths(inputDocs);

      expect(actualDocs).toEqual(expectedDocs);
    });

    it("should replace image paths within the documents with complex URLs", () => {
      const provider = new WebScraperDataProvider();

      // Same content as the previous case, but the source URLs carry an
      // extra "/subpage" path segment.
      const inputDocs = [
        makeDoc(
          "https://example.com/page/subpage",
          "This is a test.  Here is a link: [Example](https://example.com). ",
        ),
        makeDoc(
          "https://example.com/another-page/subpage",
          "Another test.  Here is some **bold text**. ",
        ),
      ];
      const expectedDocs = [
        makeDoc(
          "https://example.com/page/subpage",
          "This is a test.  Here is a link: [Example](https://example.com). ",
        ),
        makeDoc(
          "https://example.com/another-page/subpage",
          "Another test.  Here is some **bold text**. ",
        ),
      ];

      const actualDocs = provider.replaceImgPathsWithAbsolutePaths(inputDocs);

      expect(actualDocs).toEqual(expectedDocs);
    });
  });
});