mirror of
https://github.com/allenai/olmocr.git
synced 2025-07-03 07:05:50 +00:00
Bump version to v0.1.68 for release
This commit is contained in:
parent
db9972c39a
commit
d2755adf55
@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
|
|
||||||
## Unreleased
|
## Unreleased
|
||||||
|
|
||||||
|
## [v0.1.68](https://github.com/allenai/olmocr/releases/tag/v0.1.68) - 2025-05-19
|
||||||
|
|
||||||
## [v0.1.60](https://github.com/allenai/olmocr/releases/tag/v0.1.60) - 2025-03-17
|
## [v0.1.60](https://github.com/allenai/olmocr/releases/tag/v0.1.60) - 2025-03-17
|
||||||
|
|
||||||
## [v0.1.58](https://github.com/allenai/olmocr/releases/tag/v0.1.58) - 2025-02-15
|
## [v0.1.58](https://github.com/allenai/olmocr/releases/tag/v0.1.58) - 2025-02-15
|
||||||
|
1
olmOCR-bench
Submodule
1
olmOCR-bench
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit 346419907b212a6c25906c0e17c15e186af9708c
|
@ -8,5 +8,5 @@ echo "$VERSION"
|
|||||||
docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-inference -t olmocr-inference-$VERSION .
|
docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-inference -t olmocr-inference-$VERSION .
|
||||||
beaker image create --workspace ai2/oe-data-pdf --name olmocr-inference-$VERSION olmocr-inference-$VERSION
|
beaker image create --workspace ai2/oe-data-pdf --name olmocr-inference-$VERSION olmocr-inference-$VERSION
|
||||||
|
|
||||||
# docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-tagging -t olmocr-tagging-$VERSION .
|
docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-tagging -t olmocr-tagging-$VERSION .
|
||||||
# beaker image create --workspace ai2/oe-data-pdf --name olmocr-tagging-$VERSION olmocr-tagging-$VERSION
|
beaker image create --workspace ai2/oe-data-pdf --name olmocr-tagging-$VERSION olmocr-tagging-$VERSION
|
@ -4,9 +4,21 @@ set -e
|
|||||||
|
|
||||||
python scripts/pii_rule_comparison.py \
|
python scripts/pii_rule_comparison.py \
|
||||||
--docs-folder /home/ubuntu/s2pdf_dedupe_minhash_v1_with_no_pii/documents \
|
--docs-folder /home/ubuntu/s2pdf_dedupe_minhash_v1_with_no_pii/documents \
|
||||||
--ref-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5" \
|
--ref-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5 and \
|
||||||
--hyp-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.4" \
|
fineweb_edu_fasttext_gt2__fineweb_edu_fasttext_gt2__score:avg>0.001 and \
|
||||||
|
avg_fraction_numbers_in_line_v1__avg_fraction_numbers_in_line_v1__avg_fraction_numbers_in_line_ratio:avg<0.2 and \
|
||||||
|
pipe_delimited_lines_v1__pipe_delimited_lines_v1__pipe_delimited_lines_ratio:avg<0.3 \
|
||||||
|
" \
|
||||||
|
--hyp-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5 and \
|
||||||
|
fineweb_edu_fasttext_gt2__fineweb_edu_fasttext_gt2__score:avg>0.001 and \
|
||||||
|
avg_fraction_numbers_in_line_v1__avg_fraction_numbers_in_line_v1__avg_fraction_numbers_in_line_ratio:avg<0.2 and \
|
||||||
|
pipe_delimited_lines_v1__pipe_delimited_lines_v1__pipe_delimited_lines_ratio:avg<0.4 \
|
||||||
|
" \
|
||||||
--output-dir results/pii_detection \
|
--output-dir results/pii_detection \
|
||||||
|
|
||||||
|
|
||||||
|
# Run1, langid, pipes and numbers
|
||||||
|
# Prompt, boilerplate, reference, prose, table classification -> train fasttext
|
||||||
|
# 50k docs to train fast text
|
||||||
|
|
||||||
tinyhost results/pii_detection/*
|
tinyhost results/pii_detection/*
|
Loading…
x
Reference in New Issue
Block a user