Mirror of https://github.com/deepset-ai/haystack.git
build: Upgrade transformers to the latest version 4.34.1 (#5994)
* Upgrade transformers to the latest version 4.34.0 so that Haystack can support the new Mistral, Nougat, and other models.
* Update release notes
* Update missing lazy import
* Update .github workflows imports
* Bump more versions in .github workflows
* Revert import sorting
* Update to catch runtime errors to match haystack_hub changes
* Add language parameter value to whisper test
* Bump transformers version in linting preview workflow
* Bump transformers version in linting preview workflow
* Bump version to v4.34.1
* Resolve mypy issue with reused variables
* Install openai-whisper without dependencies
* Remove audio extra, update whisper install instructions
* Remove audio extra, update whisper install instructions
* Keep audio extra but add version
* Keep audio extra with no constraints
* Remove audio extra

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
Parent: b9b7d7666d
Commit: 1cf70d3dce
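Background for the two-step `pip install` pattern in the workflow files below: openai-whisper pins an outdated tiktoken version (as the inline comments in this diff note), so the workflows now install Haystack and its other dependencies first and then add llvmlite, numba, and openai-whisper with `--no-deps`. A quick, illustrative way to inspect that pin in a local environment (this check is not part of the commit):

    # Illustrative check (not part of this commit): compare the tiktoken
    # requirement declared by openai-whisper with the tiktoken version installed.
    from importlib.metadata import PackageNotFoundError, requires, version

    try:
        declared = [r for r in (requires("openai-whisper") or []) if r.startswith("tiktoken")]
        print("openai-whisper declares:", declared)
        print("tiktoken installed:", version("tiktoken"))
    except PackageNotFoundError as err:
        print(f"not installed: {err}")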
.github/workflows/e2e_preview.yml (4 changed lines)

@@ -36,7 +36,9 @@ jobs:
          sudo apt install ffmpeg # for local Whisper tests

      - name: Install Haystack
-       run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+       run: |
+         pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.34.1 'sentence-transformers>=2.2.0' pypdf tika 'azure-ai-formrecognizer>=3.2.0b2'
+         pip install --no-deps llvmlite numba 'openai-whisper>=20230918' # prevent outdated version of tiktoken pinned by openai-whisper

      - name: Run tests
        run: pytest e2e/preview
.github/workflows/linting.yml (5 changed lines)

@@ -39,7 +39,9 @@ jobs:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Haystack
-       run: pip install ".[all,dev]"
+       run: |
+         pip install ".[all,dev]"
+         pip install --no-deps llvmlite numba "openai-whisper>=20230918"

      - name: Mypy
        if: steps.files.outputs.any_changed == 'true'
@@ -74,6 +76,7 @@ jobs:
      - name: Install Haystack
        run: |
          pip install ".[all,dev]"
+         pip install --no-deps llvmlite numba "openai-whisper>=20230918"
          pip install ./haystack-linter

      - name: Pylint
.github/workflows/linting_preview.yml (7 changed lines)

@@ -38,7 +38,9 @@ jobs:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Haystack
-       run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+       run: |
+         pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.34.1 'sentence-transformers>=2.2.0' pypdf tika 'azure-ai-formrecognizer>=3.2.0b2'
+         pip install --no-deps llvmlite numba 'openai-whisper>=20230918' # prevent outdated version of tiktoken pinned by openai-whisper

      - name: Mypy
        if: steps.files.outputs.any_changed == 'true'
@@ -72,7 +74,8 @@ jobs:

      - name: Install Haystack
        run: |
-         pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+         pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.34.1 'sentence-transformers>=2.2.0' pypdf tika 'azure-ai-formrecognizer>=3.2.0b2'
+         pip install --no-deps llvmlite numba 'openai-whisper>=20230918' # prevent outdated version of tiktoken pinned by openai-whisper
          pip install ./haystack-linter

      - name: Pylint
.github/workflows/tests.yml (8 changed lines)

@@ -202,7 +202,9 @@ jobs:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Haystack
-       run: pip install .[preview,dev] langdetect transformers[torch,sentencepiece]==4.32.1 sentence-transformers>=2.2.0 pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+       run: |
+         pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.34.1 'sentence-transformers>=2.2.0' pypdf tika 'azure-ai-formrecognizer>=3.2.0b2'
+         pip install --no-deps llvmlite numba 'openai-whisper>=20230918' # prevent outdated version of tiktoken pinned by openai-whisper

      - name: Run
        run: pytest --cov-report xml:coverage.xml --cov="haystack" -m "unit" test/preview
@@ -946,7 +948,9 @@ jobs:
          sudo apt install ffmpeg # for local Whisper tests

      - name: Install Haystack
-       run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+       run: |
+         pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.34.1 'sentence-transformers>=2.2.0' pypdf tika 'azure-ai-formrecognizer>=3.2.0b2'
+         pip install --no-deps llvmlite numba 'openai-whisper>=20230918' # prevent outdated version of tiktoken pinned by openai-whisper

      - name: Run tests
        run: |
.github/workflows/tests_preview.yml (16 changed lines)

@@ -116,7 +116,9 @@ jobs:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Haystack
-       run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+       run: |
+         pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.34.1 'sentence-transformers>=2.2.0' pypdf tika 'azure-ai-formrecognizer>=3.2.0b2'
+         pip install --no-deps llvmlite numba 'openai-whisper>=20230918' # prevent outdated version of tiktoken pinned by openai-whisper

      - name: Run
        run: pytest -m "unit" test/preview
@@ -175,7 +177,9 @@ jobs:
          sudo apt install ffmpeg # for local Whisper tests

      - name: Install Haystack
-       run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+       run: |
+         pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.34.1 'sentence-transformers>=2.2.0' pypdf tika 'azure-ai-formrecognizer>=3.2.0b2'
+         pip install --no-deps llvmlite numba 'openai-whisper>=20230918' # prevent outdated version of tiktoken pinned by openai-whisper

      - name: Run
        run: pytest --maxfail=5 -m "integration" test/preview
@@ -230,7 +234,9 @@ jobs:
          colima start

      - name: Install Haystack
-       run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+       run: |
+         pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.34.1 'sentence-transformers>=2.2.0' pypdf tika 'azure-ai-formrecognizer>=3.2.0b2'
+         pip install --no-deps llvmlite numba 'openai-whisper>=20230918' # prevent outdated version of tiktoken pinned by openai-whisper

      - name: Run Tika
        run: docker run -d -p 9998:9998 apache/tika:2.9.0.0
@@ -282,7 +288,9 @@ jobs:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Haystack
-       run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+       run: |
+         pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.34.1 'sentence-transformers>=2.2.0' pypdf tika 'azure-ai-formrecognizer>=3.2.0b2'
+         pip install --no-deps llvmlite numba 'openai-whisper>=20230918' # prevent outdated version of tiktoken pinned by openai-whisper

      - name: Run
        run: pytest --maxfail=5 -m "integration" test/preview -k 'not tika'
@@ -28,6 +28,8 @@ class WhisperTranscriber(BaseComponent):

     To use Whisper locally, install it following the instructions on
     the Whisper [GitHub repo](https://github.com/openai/whisper) and omit the `api_key` parameter.
+    You can work around a dependency conflict caused by openai-whisper pinning an older tiktoken version than required
+    by Haystack if you install via `pip install --no-deps numba llvmlite 'openai-whisper>=20230918'`.

     To use the API implementation, provide an api_key. You can get one by signing up
     for an [OpenAI account](https://beta.openai.com/).
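As context for the docstring addition above, a minimal sketch of using the locally installed Whisper package once the `--no-deps` workaround has been applied; the model size "tiny" and the audio path are placeholders rather than values from this diff, and ffmpeg must be available on PATH:

    # Minimal local-Whisper sketch (placeholders, not from this diff).
    import whisper

    model = whisper.load_model("tiny")       # downloads the checkpoint on first use
    result = model.transcribe("audio.mp3")   # needs ffmpeg; returns a dict with "text" and "segments"
    print(result["text"])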
@@ -6,7 +6,10 @@ from pathlib import Path
 from haystack.preview import component, Document, default_to_dict, ComponentError
 from haystack.preview.lazy_imports import LazyImport

-with LazyImport("Run 'pip install openai-whisper'") as whisper_import:
+with LazyImport(
+    "Run 'pip install transformers[torch]==4.34.1' to install torch and "
+    "'pip install --no-deps numba llvmlite 'openai-whisper>=20230918'' to install whisper."
+) as whisper_import:
     import torch
     import whisper
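The longer message above is only shown when the deferred import actually fails: roughly, Haystack's LazyImport context manager swallows the ImportError at module import time and re-raises it, with the stored hint, when the component later calls check(). A simplified sketch of that pattern (the transcribe_or_fail function is illustrative, not part of this diff):

    # Simplified sketch of the LazyImport pattern (illustrative, not from this diff).
    from haystack.preview.lazy_imports import LazyImport

    with LazyImport("Run 'pip install --no-deps numba llvmlite openai-whisper'") as whisper_import:
        import whisper  # if this fails, the error is swallowed here ...

    def transcribe_or_fail(path: str):
        whisper_import.check()  # ... and re-raised here with the install hint
        return whisper.load_model("tiny").transcribe(path)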
@@ -8,7 +8,7 @@ from haystack.preview.lazy_imports import LazyImport
 logger = logging.getLogger(__name__)


-with LazyImport(message="Run 'pip install transformers[torch,sentencepiece]==4.32.1'") as torch_and_transformers_import:
+with LazyImport(message="Run 'pip install transformers[torch,sentencepiece]==4.34.1'") as torch_and_transformers_import:
     import torch
     from transformers import AutoModelForSequenceClassification, AutoTokenizer
@@ -7,7 +7,7 @@ from haystack.preview import component, default_to_dict, ComponentError, Documen
 from haystack.preview.lazy_imports import LazyImport

 with LazyImport(
-    "Run 'pip install transformers[torch,sentencepiece]==4.32.1 sentence-transformers>=2.2.0'"
+    "Run 'pip install transformers[torch,sentencepiece]==4.34.1 sentence-transformers>=2.2.0'"
 ) as torch_and_transformers_import:
     from transformers import AutoModelForQuestionAnswering, AutoTokenizer
     from tokenizers import Encoding
@@ -192,17 +192,17 @@ class ExtractiveReader:
         start_candidates = start_candidates.cpu()
         end_candidates = end_candidates.cpu()

-        start_candidates = [
+        start_candidates_char_indices = [
             [encoding.token_to_chars(start)[0] for start in candidates]
             for candidates, encoding in zip(start_candidates, encodings)
         ]
-        end_candidates = [
+        end_candidates_char_indices = [
             [encoding.token_to_chars(end)[1] for end in candidates]
             for candidates, encoding in zip(end_candidates, encodings)
         ]
         probabilities = candidates.values.cpu()

-        return start_candidates, end_candidates, probabilities
+        return start_candidates_char_indices, end_candidates_char_indices, probabilities

     def _nest_answers(
         self,
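The renames above resolve the mypy complaint about reusing the same variable names for tensors of token indices and for lists of character offsets (the "resolve mypy issue with reused variables" item in the commit message). The conversion itself relies on the fast tokenizer's token_to_chars, which maps a token index back to its (start, end) character span; a small illustrative sketch (the model name and text are examples, and downloading the tokenizer requires network access):

    # Illustrative sketch of Encoding.token_to_chars (not from this diff).
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example model from the Hub
    encoding = tokenizer("Haystack answers questions.").encodings[0]
    span = encoding.token_to_chars(1)  # token 1 is the first token after [CLS]
    print(span[0], span[1])            # character start and end offsets in the input string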
@@ -49,7 +49,7 @@ dependencies = [
   "requests",
   "httpx",
   "pydantic<2",
-  "transformers==4.32.1",
+  "transformers==4.34.1",
   "pandas",
   "rank_bm25",
   "scikit-learn>=1.3.0", # TF-IDF and metrics
@@ -62,7 +62,6 @@ dependencies = [
   "networkx", # graphs library
   "quantulum3", # quantities extraction from text
   "posthog", # telemetry
-  # audio's espnet-model-zoo requires huggingface-hub version <0.8 while we need >=0.5 to be able to use create_repo in FARMReader
   "tenacity", # retry decorator
   "sseclient-py", # server side events for OpenAI streaming
   "more_itertools", # utilities
@@ -102,7 +101,7 @@ preview = [
   "more-itertools", # TextDocumentSplitter
 ]
 inference = [
-  "transformers[torch,sentencepiece]==4.32.1",
+  "transformers[torch,sentencepiece]==4.34.1",
   "sentence-transformers>=2.2.0", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
   "huggingface-hub>=0.5.0",
 ]
@@ -152,9 +151,6 @@ docstores = [
 docstores-gpu = [
   "farm-haystack[elasticsearch,faiss-gpu,weaviate,pinecone,opensearch]",
 ]
-audio = [
-  "openai-whisper"
-]
 aws = [
   "boto3",
   # Costraint botocore to avoid taking to much time to resolve the dependency tree.
@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    Upgrade Transformers to the latest version 4.34.1.
+    This version adds support for the new Mistral, Persimmon, BROS, ViTMatte, and Nougat models.
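Since the release note above highlights the newly supported architectures, a cheap offline way to confirm the upgrade took effect in an environment is to import a configuration class that only exists from transformers 4.34 onwards (illustrative, not part of the commit):

    # MistralConfig is only available in transformers >= 4.34.
    from transformers import MistralConfig

    config = MistralConfig()  # default Mistral-7B-style hyperparameters
    print(config.model_type, config.hidden_size)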
@@ -27,7 +27,7 @@ def test_basic_loading(pretrained_model_name_or_path, lm_class, monkeypatch):

 @pytest.mark.unit
 def test_basic_loading_unknown_model():
-    with pytest.raises(OSError):
+    with pytest.raises(RuntimeError):
         get_language_model("model_that_doesnt_exist")
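The change above only swaps the expected exception type from OSError to RuntimeError; the commit message attributes this to matching haystack_hub-related changes in how the failed load surfaces. A self-contained sketch of the assertion pattern, where load_model is a stand-in rather than Haystack's get_language_model:

    # Stand-alone sketch of the updated assertion pattern; load_model is a
    # placeholder, not Haystack's get_language_model.
    import pytest

    def load_model(name: str):
        raise RuntimeError(f"Could not load '{name}'")

    def test_unknown_model_raises():
        with pytest.raises(RuntimeError):
            load_model("model_that_doesnt_exist")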
@@ -145,7 +145,7 @@ class TestLocalWhisperTranscriber:
     @pytest.mark.integration
     @pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="ffmpeg not installed on Windows CI")
     def test_whisper_local_transcriber(self, preview_samples_path):
-        comp = LocalWhisperTranscriber(model_name_or_path="medium")
+        comp = LocalWhisperTranscriber(model_name_or_path="medium", whisper_params={"language": "english"})
         comp.warm_up()
         output = comp.run(
             audio_files=[