mirror of
https://github.com/microsoft/markitdown.git
synced 2025-06-26 22:00:21 +00:00
feat(docker): improve dockerfile build (#220)
* refactor(docker): remove unnecessary root user The USER root directive isn't needed directly after FROM Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com> * fix(docker): use generic nobody nogroup default instead of uid gid Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com> * fix(docker): build app from source locally instead of installing package Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com> * fix(docker): use correct files in dockerignore Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com> * chore(docker): dont install recommended packages with git Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com> * fix(docker): run apt as non-interactive Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com> * Update Dockerfile to new package structure, and fix streaming bugs. --------- Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com> Co-authored-by: afourney <adamfo@microsoft.com>
This commit is contained in:
parent
0229ff6cb7
commit
515fa854bf
@ -1 +1,2 @@
|
||||
*
|
||||
*
|
||||
!packages/
|
||||
|
30
Dockerfile
30
Dockerfile
@ -1,22 +1,32 @@
|
||||
FROM python:3.13-slim-bullseye
|
||||
|
||||
USER root
|
||||
|
||||
ARG INSTALL_GIT=false
|
||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
|
||||
fi
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXIFTOOL_PATH=/usr/bin/exiftool
|
||||
ENV FFMPEG_PATH=/usr/bin/ffmpeg
|
||||
|
||||
# Runtime dependency
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ffmpeg \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
exiftool
|
||||
|
||||
RUN pip install markitdown
|
||||
ARG INSTALL_GIT=false
|
||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||
apt-get install -y --no-install-recommends \
|
||||
git; \
|
||||
fi
|
||||
|
||||
# Cleanup
|
||||
RUN rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
COPY . /app
|
||||
RUN pip --no-cache-dir install \
|
||||
/app/packages/markitdown[all] \
|
||||
/app/packages/markitdown-sample-plugin
|
||||
|
||||
# Default USERID and GROUPID
|
||||
ARG USERID=10000
|
||||
ARG GROUPID=10000
|
||||
ARG USERID=nobody
|
||||
ARG GROUPID=nogroup
|
||||
|
||||
USER $USERID:$GROUPID
|
||||
|
||||
|
@ -327,6 +327,17 @@ class MarkItDown:
|
||||
elif base_guess.extension is not None:
|
||||
placeholder_filename = "placeholder" + base_guess.extension
|
||||
|
||||
# Check if we have a seekable stream. If not, load the entire stream into memory.
|
||||
if not stream.seekable():
|
||||
buffer = io.BytesIO()
|
||||
while True:
|
||||
chunk = stream.read(4096)
|
||||
if not chunk:
|
||||
break
|
||||
buffer.write(chunk)
|
||||
buffer.seek(0)
|
||||
stream = buffer
|
||||
|
||||
# Add guesses based on stream content
|
||||
for guess in _guess_stream_info_from_stream(
|
||||
file_stream=stream, filename_hint=placeholder_filename
|
||||
|
@ -7,6 +7,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
olefile = None
|
||||
try:
|
||||
import olefile
|
||||
except ImportError:
|
||||
@ -48,7 +49,7 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
# Brute force, check if we have an OLE file
|
||||
cur_pos = file_stream.tell()
|
||||
try:
|
||||
if not olefile.isOleFile(file_stream):
|
||||
if olefile and not olefile.isOleFile(file_stream):
|
||||
return False
|
||||
finally:
|
||||
file_stream.seek(cur_pos)
|
||||
|
Loading…
x
Reference in New Issue
Block a user