Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							d5929af010
							
						
					 | 
					
						
						
							
							fix(queue-worker/kickoff): make crawls wait for kickoff to finish (matters on big sitemapped sites)
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-17 16:04:01 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							23bb172592
							
						
					 | 
					
						
						
							
							fix(crawler): recognize sitemaps in robots.txt
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-17 15:45:52 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Móricz Gergő
							
						 
					 | 
					
						
						
						
						
							
						
						
							faf58dfca7
							
						
					 | 
					
						
						
							
							fix(removeUnwantedElements): post-includeTags excludeTags
						
						
						
						
						
						
						
						Fixes #700 
						
						
							
						
					 | 
					
						2025-01-17 12:41:00 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Móricz Gergő
							
						 
					 | 
					
						
						
						
						
							
						
						
							de08b37480
							
						
					 | 
					
						
						
							
							feat: adjust CI testing
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-17 11:51:46 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Móricz Gergő
							
						 
					 | 
					
						
						
						
						
							
						
						
							4a947e385f
							
						
					 | 
					
						
						
							
							fix(queue-worker): fill out time taken on failure too
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-17 11:28:37 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							8e57fdec2c
							
						
					 | 
					
						
						
							
							Update package.json
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-16 14:02:25 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							80e5acf68c
							
						
					 | 
					
						
						
							
							Nick: error details for extract
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-16 14:02:15 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							6c94db7ed0
							
						
					 | 
					
						
						
							
							fix(html,markdown): always get absolute links
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-16 16:56:13 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							e824303d87
							
						
					 | 
					
						
						
							
							feat(html): always pick largest image from srcset
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-16 16:51:33 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							655753cd27
							
						
					 | 
					
						
						
							
							fix(url): allow domains with ports
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-16 16:30:14 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							de14c0a45d
							
						
					 | 
					
						
						
							
							Update package.json
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-15 18:12:34 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							c3937996b1
							
						
					 | 
					
						
						
							
							feat(js-sdk): add further options to checkxstatus
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-15 20:16:39 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							cbe67d89a5
							
						
					 | 
					
						
						
							
							feat(queue-worker): proactive job cancel
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-15 19:02:20 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							ec039dcb8f
							
						
					 | 
					
						
						
							
							fix(blocklist): unblock
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-15 18:54:26 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							dde3aebac4
							
						
					 | 
					
						
						
							
							fix(v1/crawl-status): fix stuck on 0 jobs
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-15 18:51:39 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							ce2f6ff884
							
						
					 | 
					
						
						
							
							fix(queue-worker/billing): fix crawl overbilling
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-15 17:22:52 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							db89e365eb
							
						
					 | 
					
						
						
							
							Update check-fire-engine.ts
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-15 01:16:42 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							957eea4113
							
						
					 | 
					
						
						
							
							Nick: extract without a schema should work as expected
						
						
						
						
						
						
							
 v.1.3.0
						
					 | 
					
						2025-01-14 11:37:00 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							61e6af2b16
							
						
					 | 
					
						
						
							
							Nick: streaming callback experimental
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-14 02:13:42 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							23d3257a57
							
						
					 | 
					
						
						
							
							Merge branch 'nsc/__experimental_streamSteps'
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-14 02:00:58 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							c323c64671
							
						
					 | 
					
						
						
							
							Update extract-redis.ts
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-14 02:00:47 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							2dc87a2e1c
							
						
					 | 
					
						
						
							
							Update extraction-service.ts
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-14 01:59:52 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
							
							
						
						
						
							
						
						
							0496b793c4
							
						
					 | 
					
						
						
							
							Merge pull request #1063 from mendableai/nsc/__experimental_streamSteps
						
						
						
						
						
						
						
						__experimental_streamSteps 
						
						
							
						
					 | 
					
						2025-01-14 01:48:13 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							033e9bbf29
							
						
					 | 
					
						
						
							
							Nick: __experimental_streamSteps
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-14 01:45:50 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							558a7f4c08
							
						
					 | 
					
						
						
							
							Update package.json
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-14 01:35:29 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							9759f18725
							
						
					 | 
					
						
						
							
							Nick: temp file fixes
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-13 23:56:53 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							ac6650e488
							
						
					 | 
					
						
						
							
							Update requests.http
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-13 22:31:54 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
							
							
						
						
						
							
						
						
							5e5b5ee0e2
							
						
					 | 
					
						
						
							
							(feat/extract) New re-ranker + multi entity extraction (#1061)
						
						
						
						
						
						
						
						* agent that decides if splits schema or not
* split and merge properties done
* wip
* wip
* changes
* ch
* array merge working!
* comment
* wip
* dereferentiate schema
* dereference schemas
* Nick: new re-ranker
* Create llm-links.txt
* Nick: format
* Update extraction-service.ts
* wip: cooking schema mix and spread functions
* wip
* wip getting there!!!
* nick:
* moved functions to helpers
* nick:
* cant reproduce the error anymore
* error handling all scrapes failed
* fix
* Nick: added the sitemap index
* Update sitemap-index.ts
* Update map.ts
* deduplicate and merge arrays
* added error handler for object transformations
* Update url-processor.ts
* Nick:
* Nick: fixes
* Nick: big improvements to rerank of multi-entity
* Nick: working
* Update reranker.ts
* fixed transformations for nested objs
* fix merge nulls
* Nick: fixed error piping
* Update queue-worker.ts
* Update extraction-service.ts
* Nick: format
* Update queue-worker.ts
* Update pnpm-lock.yaml
* Update queue-worker.ts
---------
Co-authored-by: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Co-authored-by: Thomas Kosmas <thomas510111@gmail.com> 
						
						
							
						
					 | 
					
						2025-01-13 22:30:15 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
							
							
						
						
						
							
						
						
							5c62bb1195
							
						
					 | 
					
						
						
							
							feat: new snips test framework (FIR-414) (#1033)
						
						
						
						
						
						
						
						* feat: new snips test framework
* Update mock.ts
---------
Co-authored-by: Nicolas <nicolascamara29@gmail.com> 
						
						
							
						
					 | 
					
						2025-01-13 20:50:47 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							9a13c1dede
							
						
					 | 
					
						
						
							
							Nick: fixes to extract rephrase prompt
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-11 20:22:36 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							a82160a630
							
						
					 | 
					
						
						
							
							Update crawl-redis.ts
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-10 21:31:23 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							f4d10c5031
							
						
					 | 
					
						
						
							
							Nick: formatting fixes
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-10 18:35:10 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							d1f3b96388
							
						
					 | 
					
						
						
							
							feat: add scrapeId in document.metadata
						
						
						
						
						
						
							
 v1.2.1
						
					 | 
					
						2025-01-09 20:52:12 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							29c1f126ab
							
						
					 | 
					
						
						
							
							feat(scrape-status): adapt
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-09 19:14:00 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							2849ce2f13
							
						
					 | 
					
						
						
							
							fix(queue-worker): errored job logging
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-09 18:48:47 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							97bf54214f
							
						
					 | 
					
						
						
							
							fix(scrapeURL/loop): re-add is long enough check with lt 0
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-09 18:43:50 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gergő Móricz
							
						 
					 | 
					
						
						
						
						
							
						
						
							0da386914d
							
						
					 | 
					
						
						
							
							fix(queue-worker): graceful shutdown
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-09 16:04:59 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Móricz Gergő
							
						 
					 | 
					
						
						
						
						
							
						
						
							3c614a2e5c
							
						
					 | 
					
						
						
							
							fix(scrapeURL/engines/pdf,docx): support authorization
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-09 10:03:27 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Móricz Gergő
							
						 
					 | 
					
						
						
						
						
							
						
						
							49e584f8e1
							
						
					 | 
					
						
						
							
							fix(queue-worker/crawl): use SCARD to generate num_docs field
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-09 09:51:34 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Móricz Gergő
							
						 
					 | 
					
						
						
						
						
							
						
						
							9e8c629ff4
							
						
					 | 
					
						
						
							
							fix(log_job): don't redact with auth header
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-09 09:51:34 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							14f696805c
							
						
					 | 
					
						
						
							
							Update auth.ts
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-08 17:04:57 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							51cb4b1615
							
						
					 | 
					
						
						
							
							Nick: temp rl for /extract
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-08 15:24:38 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							a199208e21
							
						
					 | 
					
						
						
							
							Update rate-limiter.ts
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-08 15:15:21 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							aa31508ccd
							
						
					 | 
					
						
						
							
							Nick: links-billed update (temp)
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-08 15:13:33 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Móricz Gergő
							
						 
					 | 
					
						
						
						
						
							
						
						
							363021ea78
							
						
					 | 
					
						
						
							
							feat(crawl): ensure url trimming
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-08 12:35:42 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Móricz Gergő
							
						 
					 | 
					
						
						
						
						
							
						
						
							977a3e13c5
							
						
					 | 
					
						
						
							
							fix(scrapeURL): remove short content check
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-08 11:23:25 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							0a41fdd35d
							
						
					 | 
					
						
						
							
							Merge branch 'nsc/extract-queue'
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-07 18:21:57 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							7918d0e1c9
							
						
					 | 
					
						
						
							
							Nick: bump 1.12.0
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-07 18:20:56 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
							
							
						
						
						
							
						
						
							f82a742cd1
							
						
					 | 
					
						
						
							
							Merge pull request #1044 from mendableai/nsc/extract-queue
						
						
						
						
						
						
						
						(feat/extract) Move extract to a queue system 
						
						
							
						
					 | 
					
						2025-01-07 18:10:46 -03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nicolas
							
						 
					 | 
					
						
						
						
						
							
						
						
							b98e289f03
							
						
					 | 
					
						
						
							
							Nick:
						
						
						
						
						
						
							
						
					 | 
					
						2025-01-07 17:49:21 -03:00 | 
					
					
						
						
							
							
							
						
					 |