mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-26 17:04:02 +00:00
Adding mineru script
This commit is contained in:
parent
e5a80c572c
commit
f2f761973c
@ -60,4 +60,12 @@ python olmocr/bench/runners/run_marker.py olmocr/bench/sample_data/pdfs
|
||||
|
||||
pip install verovio torchvision
|
||||
python olmocr/bench/runners/run_gotocr.py olmocr/bench/sample_data/pdfs
|
||||
|
||||
conda create -n MinerU python=3.10
|
||||
conda activate MinerU
|
||||
pip install -U "magic-pdf[full]==1.1.0" --extra-index-url https://wheels.myhloli.com
|
||||
pip install huggingface_hub
|
||||
wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
|
||||
python download_models_hf.py
|
||||
python olmocr/bench/runners/run_mineru.py olmocr/bench/sample_data/pdfs
|
||||
```
|
||||
|
@ -0,0 +1,86 @@
|
||||
import os
|
||||
import shutil
|
||||
import argparse
|
||||
|
||||
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
|
||||
from magic_pdf.data.dataset import PymuDocDataset
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.config.enums import SupportedPdfParseMethod
|
||||
|
||||
|
||||
def run(pdf_folder):
    """
    Convert all PDF files in the specified folder to markdown using MinerU.

    For each ``<name>.pdf`` a ``<name>.md`` file is written to a folder called
    "mineru" located in the same parent directory as pdf_folder. An "images"
    subfolder is handed to MinerU as its image output location while the
    PDFs are processed and is deleted again once the run is complete, so
    only the markdown files remain.

    :param pdf_folder: Path to the folder containing PDF files.
    """
    # Resolve absolute paths
    pdf_folder = os.path.abspath(pdf_folder)
    parent_dir = os.path.dirname(pdf_folder)
    output_folder = os.path.join(parent_dir, "mineru")
    image_output_folder = os.path.join(output_folder, "images")
    # Image directory name passed to MinerU when it renders the markdown.
    image_dir_basename = os.path.basename(image_output_folder)

    # Creating the nested images folder creates output_folder as well.
    os.makedirs(image_output_folder, exist_ok=True)

    # Reader and writers are the same for all PDFs
    image_writer = FileBasedDataWriter(image_output_folder)
    md_writer = FileBasedDataWriter(output_folder)
    reader = FileBasedDataReader("")

    # List all PDF files in the provided folder; sorted because os.listdir
    # returns entries in arbitrary order and a stable order makes runs
    # reproducible.
    pdf_files = sorted(
        os.path.join(pdf_folder, filename)
        for filename in os.listdir(pdf_folder)
        if filename.lower().endswith(".pdf")
    )

    for pdf_path in pdf_files:
        print(f"Processing {pdf_path}...")

        # Strip only the final extension so that names containing extra dots
        # ("report.v2.pdf" -> "report.v2") stay distinct in the output folder.
        pdf_file_name = os.path.basename(pdf_path)
        name_without_suff = os.path.splitext(pdf_file_name)[0]

        # Read the PDF file bytes and create the dataset instance
        pdf_bytes = reader.read(pdf_path)
        ds = PymuDocDataset(pdf_bytes)

        # Inference: decide whether to run OCR mode based on dataset classification
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

        # Dump markdown file
        md_file_name = f"{name_without_suff}.md"
        pipe_result.dump_md(md_writer, md_file_name, image_dir_basename)

        print(f"Finished processing {pdf_file_name}. Outputs saved to {output_folder}.")

    # Remove useless image folder
    shutil.rmtree(image_output_folder)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: takes the PDF folder as the single positional
    # argument and hands it to run().
    cli = argparse.ArgumentParser(description="Convert all PDF files in a folder to markdown and related outputs using MinerU.")
    cli.add_argument("pdf_folder", type=str, help="Path to the folder containing PDF files (e.g., '/path/to/pdfs')")
    cli_args = cli.parse_args()
    run(cli_args.pdf_folder)
|
48
olmocr/bench/sample_data/mineru/multi_column_miss.md
Normal file
48
olmocr/bench/sample_data/mineru/multi_column_miss.md
Normal file
@ -0,0 +1,48 @@
|
||||
stakeholders has occurred in other nations, with groups and individuals refusing to risk being appropriated into the industry’s public relations ambitions. It now looks like that with vigilance, tobacco control advocates can easily foment similar distaste in many areas of the business community. Our actions sought to denormalise the tobacco industry by disrupting its efforts to take its place alongside other industries—often with considerable social credit—in the hope that it might gain by association.
|
||||
|
||||
Tobacco industry posturing about its corporate responsibility can never hide the ugly consequences of its ongoing efforts to ‘‘work with all relevant stakeholders for the preservation of opportunities for informed adults to consume tobacco products’’1 (translation: ‘‘we will build alliances with others who want to profit from tobacco use, to do all we can to counteract effective tobacco control’’). BAT has $15.4\%$ and Philip Morris $16.4\%$ of the global cigarette market.6 With 4.9 million smokers currently dying from tobacco use each year, and the industry unblinkingly concurring that its products are addictive, this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths of some 754 600 smokers, and Philip Morris some 803 600 smokers.
|
||||
|
||||
# REFERENCES
|
||||
|
||||
1 British American Tobacco. Social Report. http://www.bat.com/204pp.
|
||||
2 Wroe D. Tobacco ad campaign angers MPs. The Age (Melbourne) 2004; May 17 http://www.theage.com.au/articles/2004/05/16/ 1084646069771.html?oneclick $=$ true.
|
||||
3 Hirschhorn N. Corporate social responsibility and the tobacco industry: hope or hype? Tobacco Control 2004;13:447–53.
|
||||
4 Ethical Corporation Asia 2004. Conference website. http:// www.ethicalcorp.com/asia2004/.
|
||||
5 Chapman S, Shatenstein S. Extreme corporate makeover: tobacco companies, corporate responsibility and the corruption of ‘‘ethics’’. Globalink petition. http://petition.globalink.org/view.php?code $=$ extreme.
|
||||
6 Mackay J, Eriksen M. The tobacco atlas. Geneva: World Health Organization, 2002.
|
||||
|
||||
# INDUSTRY WATCH
|
||||
|
||||
Corporate social responsibility and the tobacco industry: hope or hype?
|
||||
|
||||
N Hirschhorn
|
||||
|
||||
Corporate social responsibility (CSR) emerged from a realisation among transnational corporations of the need to account for and redress their adverse impact on society: specifically, on human rights, labour practices, and the environment. Two transnational tobacco companies have recently adopted CSR: Philip Morris, and British American Tobacco. This report explains the origins and theory behind CSR; examines internal company documents from Philip Morris showing the company’s deliberations on the matter, and the company’s perspective on its own behaviour; and reflects on whether marketing tobacco is antithetical to social responsibility.
|
||||
|
||||
Tobacco Control 2004;13:447–453. doi: 10.1136/tc.2003.006676 tobacco company espousing CSR should be judged simply as a corporate entity along standards of business ethics, or as an irretrievably negative force in the realm of public health, thereby rendering CSR an oxymoron.
|
||||
|
||||
# CORPORATE SOCIAL RESPONSIBILITY: THE CONTEXT
|
||||
|
||||
The term ‘‘corporate social responsibility’’ is in vogue at the moment but as a concept it is vague and means different things to different people.1
|
||||
|
||||
Some writers on CSR trace its American roots to the 19th century when large industries engaged in philanthropy and established great public institutions, a form of ‘‘noblesse oblige’’. But the notion that corporations should be required to return more to society because of their impact on society was driven by pressures from the civil rights, peace, and environmental movements of the last half century.2 3 The unprecedented expansion of power and influence of TNCs over the past three decades has accelerated global trade and development, but also environmental damage and abuses of
|
||||
|
||||
Correspondence to: Dr Norbert Hirschhorn, Nastolantie 6, A3 00600 Helsinki, Finland; bertzpoet@yahoo.com
|
||||
|
||||
Received 13 November 2003 Accepted 15 July 2004
|
||||
|
||||
ver the past three decades increasing pressure from non-governmental organisations (NGOs), governments and the
|
||||
United Nations, has required transnational cor
|
||||
porations (TNCs) to examine and redress the
|
||||
adverse impact their businesses have on society
|
||||
and the environment. Many have responded by
|
||||
taking up what is known as ‘‘corporate social
|
||||
responsibility’’ (CSR); only recently have two
|
||||
major cigarette companies followed suit: Philip
|
||||
Morris (PM) and British American Tobacco
|
||||
(BAT). This report first provides the context
|
||||
and development of CSR; then, from internal
|
||||
company documents, examines how PM came to
|
||||
its own version. This paper examines whether a
|
||||
|
||||
Abbreviations: ASH, Action on Smoking and Health; BAT, British American Tobacco; CERES, Coalition for Environmentally Responsible Economies; CSR, corporate social responsibility; DJSI, Dow Jones Sustainability Index; GCAC, Global Corporate Affairs Council; GRI, Global Reporting Initiative; MSA, Master Settlement Agreement; $N G O s,$ non-governmental organisations; PM, Philip Morris; TNCs, transnational corporations; UNEP, United Nations Environment Program
|
Loading…
x
Reference in New Issue
Block a user