feat: add vertexai embeddings (#2693)

This PR:
- Adds VertexAI embeddings as an embedding provider

Testing
- Tested with pinecone destination connector on
[this](https://github.com/Unstructured-IO/unstructured/actions/runs/8429035114/job/23082700074?pr=2693)
job run.

---------

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
This commit is contained in:
Ahmet Melek 2024-03-28 21:15:36 +00:00 committed by GitHub
parent 887e6c9094
commit d46792214a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
20 changed files with 24484 additions and 4 deletions

View File

@ -1,11 +1,12 @@
## 0.13.0-dev13
## 0.13.0-dev14
### Enhancements
### Enhancements
* **Add `.metadata.is_continuation` to text-split chunks.** `.metadata.is_continuation=True` is added to second-and-later chunks formed by text-splitting an oversized `Table` element but not to their counterpart `Text` element splits. Add this indicator for `CompositeElement` to allow text-split continuation chunks to be identified for downstream processes that may wish to skip intentionally redundant metadata values in continuation chunks.
* **Add `composite_structure_acc` metric to table eval.** Add a new property to `unstructured.metrics.table_eval.TableEvaluation`: `composite_structure_acc`, which is computed from the element level row and column index and content accuracy scores.
* **Add `.metadata.orig_elements` to chunks.** `.metadata.orig_elements: list[Element]` is added to chunks during the chunking process (when requested) to allow access to information from the elements each chunk was formed from. This is useful for example to recover metadata fields that cannot be consolidated to a single value for a chunk, like `page_number`, `coordinates`, and `image_base64`.
* **Add `--include_orig_elements` option to Ingest CLI.** By default, when chunking, the original elements used to form each chunk are added to `chunk.metadata.orig_elements` for each chunk. The `include_orig_elements` parameter allows the user to turn off this behavior to produce a smaller payload when they don't need this metadata.
* **Add Google VertexAI embedder.** Adds VertexAI embeddings to support embedding via Google Vertex AI.
### Features

View File

@ -171,6 +171,59 @@ To obtain an api key, visit: https://octo.ai/docs/getting-started/how-to-create-
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)
[print(e.embeddings, e) for e in elements]
print(query_embedding, query)
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
``VertexAIEmbeddingEncoder``
----------------------------
The ``VertexAIEmbeddingEncoder`` class connects to the GCP VertexAI to obtain embeddings for pieces of text.
``embed_documents`` will receive a list of Elements, and return an updated list which
includes the ``embeddings`` attribute for each Element.
``embed_query`` will receive a query as a string, and return a list of floats which is the
embedding vector for the given query string.
``num_of_dimensions`` is a metadata property that denotes the number of dimensions in any
embedding vector obtained via this class.
``is_unit_vector`` is a metadata property that denotes if embedding vectors obtained via
this class are unit vectors.
The following code block shows an example of how to use ``VertexAIEmbeddingEncoder``. You will
see the updated elements list (with the ``embeddings`` attribute included for each element),
the embedding vector for the query string, and some metadata properties about the embedding model.
To use Vertex AI PaLM you will need to:
- either, pass the full json content of your GCP VertexAI application credentials to the
VertexAIEmbeddingConfig as the api_key parameter. (This will create a file in the ``/tmp``
directory with the content of the json, and set the GOOGLE_APPLICATION_CREDENTIALS environment
variable to the **path** of the created file.)
- or, you'll need to store the path to a manually created service account JSON file as the
GOOGLE_APPLICATION_CREDENTIALS environment variable. (For more information:
https://python.langchain.com/docs/integrations/text_embedding/google_vertex_ai_palm)
- or, you'll need to have the credentials configured for your environment (gcloud,
workload identity, etc…)
.. code:: python
import os
from unstructured.documents.elements import Text
from unstructured.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
embedding_encoder = VertexAIEmbeddingEncoder(
config=VertexAIEmbeddingConfig(api_key=os.environ["VERTEXAI_GCP_APP_CREDS_JSON_CONTENT"])
)
elements = embedding_encoder.embed_documents(
elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)
[print(e.embeddings, e) for e in elements]
print(query_embedding, query)
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())

View File

@ -0,0 +1,30 @@
import os
from unstructured.documents.elements import Text
from unstructured.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
# To use Vertex AI PaLM you will need to:
# - either, pass the full json content of your GCP VertexAI application credentials to the
# VertexAIEmbeddingConfig as the api_key parameter. (This will create a file in the ``/tmp``
# directory with the content of the json, and set the GOOGLE_APPLICATION_CREDENTIALS environment
# variable to the **path** of the created file.)
# - or, you'll need to store the path to a manually created service account JSON file as the
# GOOGLE_APPLICATION_CREDENTIALS environment variable. (For more information:
# https://python.langchain.com/docs/integrations/text_embedding/google_vertex_ai_palm)
# - or, you'll need to have the credentials configured for your environment (gcloud,
# workload identity, etc…)
# Build the encoder from the JSON credentials stored in the
# VERTEXAI_GCP_APP_CREDS_JSON_CONTENT environment variable.
embedding_encoder = VertexAIEmbeddingEncoder(
    config=VertexAIEmbeddingConfig(api_key=os.environ["VERTEXAI_GCP_APP_CREDS_JSON_CONTENT"])
)

# embed_documents returns the elements with their ``embeddings`` attribute set.
elements = embedding_encoder.embed_documents(
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

# embed_query returns the embedding vector for a single query string.
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

# Printing is a side effect, so use a plain loop rather than building a
# throwaway list with a comprehension.
for e in elements:
    print(e.embeddings, e)
print(query_embedding, query)
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())

View File

@ -0,0 +1,4 @@
-c ../constraints.in
-c ../base.txt
openai
tiktoken

View File

@ -0,0 +1,72 @@
#
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile --output-file=ingest/embed-octoai.txt ingest/embed-octoai.in
#
anyio==3.7.1
# via
# -c ingest/../constraints.in
# httpx
# openai
certifi==2024.2.2
# via
# -c ingest/../base.txt
# -c ingest/../constraints.in
# httpcore
# httpx
# requests
charset-normalizer==3.3.2
# via
# -c ingest/../base.txt
# requests
distro==1.9.0
# via openai
exceptiongroup==1.2.0
# via anyio
h11==0.14.0
# via httpcore
httpcore==1.0.4
# via httpx
httpx==0.27.0
# via openai
idna==3.6
# via
# -c ingest/../base.txt
# anyio
# httpx
# requests
openai==1.14.3
# via -r ingest/embed-octoai.in
pydantic==1.10.14
# via
# -c ingest/../constraints.in
# openai
regex==2023.12.25
# via
# -c ingest/../base.txt
# tiktoken
requests==2.31.0
# via
# -c ingest/../base.txt
# tiktoken
sniffio==1.3.1
# via
# anyio
# httpx
# openai
tiktoken==0.6.0
# via -r ingest/embed-octoai.in
tqdm==4.66.2
# via
# -c ingest/../base.txt
# openai
typing-extensions==4.10.0
# via
# -c ingest/../base.txt
# openai
# pydantic
urllib3==2.2.1
# via
# -c ingest/../base.txt
# requests

View File

@ -0,0 +1,5 @@
-c ../constraints.in
-c ../base.txt
langchain
langchain-community
langchain-google-vertexai

View File

@ -0,0 +1,243 @@
#
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile --output-file=ingest/embed-vertexai.txt ingest/embed-vertexai.in
#
aiohttp==3.9.3
# via
# langchain
# langchain-community
aiosignal==1.3.1
# via aiohttp
anyio==3.7.1
# via
# -c ingest/../constraints.in
# langchain-core
async-timeout==4.0.3
# via
# aiohttp
# langchain
attrs==23.2.0
# via aiohttp
cachetools==5.3.3
# via google-auth
certifi==2024.2.2
# via
# -c ingest/../base.txt
# -c ingest/../constraints.in
# requests
charset-normalizer==3.3.2
# via
# -c ingest/../base.txt
# requests
dataclasses-json==0.6.4
# via
# -c ingest/../base.txt
# langchain
# langchain-community
exceptiongroup==1.2.0
# via anyio
frozenlist==1.4.1
# via
# aiohttp
# aiosignal
google-api-core[grpc]==2.18.0
# via
# google-cloud-aiplatform
# google-cloud-bigquery
# google-cloud-core
# google-cloud-resource-manager
# google-cloud-storage
google-auth==2.29.0
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-bigquery
# google-cloud-core
# google-cloud-resource-manager
# google-cloud-storage
google-cloud-aiplatform==1.44.0
# via langchain-google-vertexai
google-cloud-bigquery==3.19.0
# via google-cloud-aiplatform
google-cloud-core==2.4.1
# via
# google-cloud-bigquery
# google-cloud-storage
google-cloud-resource-manager==1.12.3
# via google-cloud-aiplatform
google-cloud-storage==2.16.0
# via
# google-cloud-aiplatform
# langchain-google-vertexai
google-crc32c==1.5.0
# via
# google-cloud-storage
# google-resumable-media
google-resumable-media==2.7.0
# via
# google-cloud-bigquery
# google-cloud-storage
googleapis-common-protos[grpc]==1.63.0
# via
# google-api-core
# grpc-google-iam-v1
# grpcio-status
grpc-google-iam-v1==0.13.0
# via google-cloud-resource-manager
grpcio==1.62.1
# via
# google-api-core
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
grpcio-status==1.62.1
# via google-api-core
idna==3.6
# via
# -c ingest/../base.txt
# anyio
# requests
# yarl
jsonpatch==1.33
# via
# langchain
# langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.1.13
# via -r ingest/embed-vertexai.in
langchain-community==0.0.29
# via
# -r ingest/embed-vertexai.in
# langchain
langchain-core==0.1.33
# via
# langchain
# langchain-community
# langchain-google-vertexai
# langchain-text-splitters
langchain-google-vertexai==0.1.1
# via -r ingest/embed-vertexai.in
langchain-text-splitters==0.0.1
# via langchain
langsmith==0.1.31
# via
# langchain
# langchain-community
# langchain-core
marshmallow==3.21.1
# via
# -c ingest/../base.txt
# dataclasses-json
multidict==6.0.5
# via
# aiohttp
# yarl
mypy-extensions==1.0.0
# via
# -c ingest/../base.txt
# typing-inspect
numpy==1.26.4
# via
# -c ingest/../base.txt
# langchain
# langchain-community
# shapely
orjson==3.9.15
# via langsmith
packaging==23.2
# via
# -c ingest/../base.txt
# -c ingest/../constraints.in
# google-cloud-aiplatform
# google-cloud-bigquery
# langchain-core
# marshmallow
proto-plus==1.23.0
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-resource-manager
protobuf==4.23.4
# via
# -c ingest/../constraints.in
# google-api-core
# google-cloud-aiplatform
# google-cloud-resource-manager
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
# proto-plus
pyasn1==0.5.1
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.3.0
# via google-auth
pydantic==1.10.14
# via
# -c ingest/../constraints.in
# langchain
# langchain-core
# langsmith
python-dateutil==2.9.0.post0
# via
# -c ingest/../base.txt
# google-cloud-bigquery
pyyaml==6.0.1
# via
# langchain
# langchain-community
# langchain-core
requests==2.31.0
# via
# -c ingest/../base.txt
# google-api-core
# google-cloud-bigquery
# google-cloud-storage
# langchain
# langchain-community
# langchain-core
# langsmith
rsa==4.9
# via google-auth
shapely==2.0.3
# via google-cloud-aiplatform
six==1.16.0
# via
# -c ingest/../base.txt
# python-dateutil
sniffio==1.3.1
# via anyio
sqlalchemy==2.0.29
# via
# langchain
# langchain-community
tenacity==8.2.3
# via
# langchain
# langchain-community
# langchain-core
types-protobuf==4.24.0.20240311
# via langchain-google-vertexai
types-requests==2.31.0.20240311
# via langchain-google-vertexai
typing-extensions==4.10.0
# via
# -c ingest/../base.txt
# pydantic
# sqlalchemy
# typing-inspect
typing-inspect==0.9.0
# via
# -c ingest/../base.txt
# dataclasses-json
urllib3==2.2.1
# via
# -c ingest/../base.txt
# requests
# types-requests
yarl==1.9.4
# via aiohttp

View File

@ -169,6 +169,8 @@ setup(
"local-inference": all_doc_reqs,
"paddleocr": load_requirements("requirements/extra-paddleocr.txt"),
"embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.txt"),
"embed-octoai": load_requirements("requirements/ingest/embed-octoai.txt"),
"embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.txt"),
"openai": load_requirements("requirements/ingest/embed-openai.txt"),
"bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.txt"),
"databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.txt"),

View File

@ -0,0 +1,19 @@
from unstructured.documents.elements import Text
from unstructured.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Elements embedded through a stubbed client must still serialize via ``to_dict()``."""
    sentences = ["This is sentence 1", "This is sentence 2"]

    # Stand-in for the real client: returns one fake embedding per input sentence.
    fake_client = mocker.MagicMock()
    fake_client.embed_documents.return_value = [1, 2]
    # Make the encoder hand out the stub instead of creating a real client.
    mocker.patch.object(VertexAIEmbeddingEncoder, "create_client", return_value=fake_client)

    config = VertexAIEmbeddingConfig(api_key="api_key")
    embedder = VertexAIEmbeddingEncoder(config=config)
    embedded = embedder.embed_documents(elements=[Text(s) for s in sentences])

    assert len(embedded) == 2
    for element, expected_text in zip(embedded, sentences):
        assert element.to_dict()["text"] == expected_text

View File

@ -0,0 +1,40 @@
#!/usr/bin/env bash

# Runs the ingest "local" source over a single example document with the
# "octoai" embedding provider, then passes the output folder to the
# check-diff-expected-output.sh and evaluation-ingest-cp.sh helper scripts.
#
# Environment:
#   OCTOAI_API_KEY  API key forwarded as --embedding-api-key.
#   OUTPUT_ROOT     Root for structured-output/ and workdir/ (default: the
#                   parent directory of the directory containing this script).
#   MAX_PROCESSES   Value for --num-processes (default: os.cpu_count()).
#   RUN_SCRIPT      Ingest entry point (default: ./unstructured/ingest/main.py).

set -e

SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=embed-octoai
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
# Default to an empty string when the key is not provided. The previous
# self-referential default (${VAR:-$VAR}) had the same effect but read like a
# real fallback.
OCTOAI_API_KEY=${OCTOAI_API_KEY:-}

# cleanup_dir is expected to come from cleanup.sh.
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
}
# Remove the output and work directories however the script exits.
trap cleanup EXIT

RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
  local \
  --num-processes "$max_processes" \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --reprocess \
  --input-path example-docs/book-war-and-peace-1p.txt \
  --work-dir "$WORK_DIR" \
  --embedding-provider "octoai" \
  --embedding-api-key "$OCTOAI_API_KEY"

# From here on a failing helper no longer aborts the script.
set +e

# Quote the folder name so it is always passed as a single argument (SC2086).
"$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -0,0 +1,41 @@
#!/usr/bin/env bash

# Runs the ingest "local" source over a single example document with the
# "langchain-vertexai" embedding provider, then passes the output folder to the
# check-diff-expected-output.sh and evaluation-ingest-cp.sh helper scripts.
#
# Environment:
#   GCP_INGEST_SERVICE_KEY  Credentials forwarded as --embedding-api-key.
#   OUTPUT_ROOT             Root for structured-output/ and workdir/ (default:
#                           the parent directory of the directory containing
#                           this script).
#   MAX_PROCESSES           Value for --num-processes (default: os.cpu_count()).
#   RUN_SCRIPT              Ingest entry point
#                           (default: ./unstructured/ingest/main.py).

set -e

SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=embed-vertexai
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
# Default to an empty string when the key is not provided. The previous
# self-referential default (${VAR:-$VAR}) had the same effect but read like a
# real fallback.
GCP_INGEST_SERVICE_KEY=${GCP_INGEST_SERVICE_KEY:-}

# cleanup_dir is expected to come from cleanup.sh.
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
}
# Remove the output and work directories however the script exits.
trap cleanup EXIT

RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
  local \
  --num-processes "$max_processes" \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --reprocess \
  --input-path example-docs/book-war-and-peace-1p.txt \
  --work-dir "$WORK_DIR" \
  --embedding-provider "langchain-vertexai" \
  --embedding-api-key "$GCP_INGEST_SERVICE_KEY" \
  --embedding-model-name "textembedding-gecko@001"

# From here on a failing helper no longer aborts the script.
set +e

# Quote the folder name so it is always passed as a single argument (SC2086).
"$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -57,6 +57,8 @@ all_tests=(
'hubspot.sh'
'local-embed.sh'
'local-embed-bedrock.sh'
'local-embed-octoai.sh'
'local-embed-vertexai.sh'
'sftp.sh'
'mongodb.sh'
'opensearch.sh'

View File

@ -1 +1 @@
__version__ = "0.13.0-dev13" # pragma: no cover
__version__ = "0.13.0-dev14" # pragma: no cover

View File

@ -2,10 +2,12 @@ from unstructured.embed.bedrock import BedrockEmbeddingEncoder
from unstructured.embed.huggingface import HuggingFaceEmbeddingEncoder
from unstructured.embed.octoai import OctoAIEmbeddingEncoder
from unstructured.embed.openai import OpenAIEmbeddingEncoder
from unstructured.embed.vertexai import VertexAIEmbeddingEncoder
# Maps each embedding provider identifier (the string key) to the encoder class
# that implements it.
EMBEDDING_PROVIDER_TO_CLASS_MAP = {
"langchain-openai": OpenAIEmbeddingEncoder,
"langchain-huggingface": HuggingFaceEmbeddingEncoder,
"langchain-aws-bedrock": BedrockEmbeddingEncoder,
"langchain-vertexai": VertexAIEmbeddingEncoder,
"octoai": OctoAIEmbeddingEncoder,
}

View File

@ -70,7 +70,7 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
@EmbeddingEncoderConnectionError.wrap
@requires_dependencies(
["openai", "tiktoken"],
extras="embed-openai",
extras="embed-octoai",
)
def create_client(self) -> "OpenAI":
"""Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""

View File

@ -0,0 +1,90 @@
# type: ignore
import json
import os
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional
import numpy as np
from unstructured.documents.elements import (
Element,
)
from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
from unstructured.ingest.error import EmbeddingEncoderConnectionError
from unstructured.utils import FileHandler, requires_dependencies
if TYPE_CHECKING:
from langchain_google_vertexai import VertexAIEmbeddings
@dataclass
class VertexAIEmbeddingConfig(EmbeddingConfig):
    """Configuration for ``VertexAIEmbeddingEncoder``."""

    # JSON content of the GCP application credentials; the encoder parses it with
    # ``json.loads`` before writing it to the credentials file.
    api_key: str
    # Model name passed to the ``VertexAIEmbeddings`` client.
    model_name: Optional[str] = "textembedding-gecko@001"
@dataclass
class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder backed by the langchain ``VertexAIEmbeddings`` client.

    The client is created lazily on first access of ``client``. Creating it writes
    the JSON credentials from ``config.api_key`` to ``application_credentials_path``
    and points the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable at that
    file.
    """

    config: VertexAIEmbeddingConfig
    # Lazily created client; populated on first access of ``client``.
    _client: Optional["VertexAIEmbeddings"] = field(init=False, default=None)
    # Cached embedding of a fixed sample query, used to derive embedding metadata.
    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)

    @property
    def client(self) -> "VertexAIEmbeddings":
        """Return the embeddings client, creating it on first use."""
        if self._client is None:
            self._client = self.create_client()
        return self._client

    @property
    def exemplary_embedding(self) -> List[float]:
        """Return the embedding of a fixed sample query, computed once and cached."""
        if self._exemplary_embedding is None:
            self._exemplary_embedding = self.client.embed_query("A sample query.")
        return self._exemplary_embedding

    def initialize(self):
        """No-op; the client is created lazily via ``client``."""
        pass

    def num_of_dimensions(self):
        """Return ``np.shape`` of the sample embedding (a tuple, not a bare int)."""
        return np.shape(self.exemplary_embedding)

    def is_unit_vector(self):
        """Return whether the L2 norm of the sample embedding is close to 1.0."""
        return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0)

    def embed_query(self, query) -> List[float]:
        """Return the embedding vector for ``query`` (converted with ``str``)."""
        return self.client.embed_query(str(query))

    def embed_documents(self, elements: List[Element]) -> List[Element]:
        """Embed ``str(element)`` for every element and attach the results.

        The elements are updated in place and the same list is returned.
        """
        embeddings = self.client.embed_documents([str(e) for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]:
        """Set ``element.embeddings`` pairwise and return the input ``elements`` list."""
        # Internal invariant: the client must return one embedding per element.
        assert len(elements) == len(embeddings)
        for element, embedding in zip(elements, embeddings):
            element.embeddings = embedding
        return elements

    @property
    def application_credentials_path(self):
        """Path of the file the application credentials are written to."""
        return os.path.join("/tmp", "google-vertex-app-credentials.json")

    def register_application_credentials(self):
        """Write the credentials JSON to disk and export its path.

        ``config.api_key`` is parsed and re-serialized as JSON, so invalid JSON
        fails here. The file is made accessible to the owner only, because it
        holds service-account secrets and lives at a predictable path.
        """
        credentials_path = self.application_credentials_path
        # Create the file with owner-only permissions *before* any secret is
        # written, and tighten permissions if it already existed.
        fd = os.open(credentials_path, os.O_WRONLY | os.O_CREAT, 0o600)
        os.close(fd)
        os.chmod(credentials_path, 0o600)

        credentials_file = FileHandler(credentials_path)
        credentials_file.write_file(json.dumps(json.loads(self.config.api_key)))
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

    @EmbeddingEncoderConnectionError.wrap
    @requires_dependencies(
        ["langchain", "langchain_google_vertexai"],
        extras="embed-vertexai",
    )
    def create_client(self) -> "VertexAIEmbeddings":
        """Creates a Langchain VertexAI python client to embed elements."""
        # Imported here so the optional dependency is only needed when a client
        # is actually created.
        from langchain_google_vertexai import VertexAIEmbeddings

        self.register_application_credentials()
        return VertexAIEmbeddings(model_name=self.config.model_name)

View File

@ -222,6 +222,13 @@ class EmbeddingConfig(BaseConfig):
region_name=self.aws_region,
)
)
elif self.provider == "langchain-vertexai":
from unstructured.embed.vertexai import (
VertexAIEmbeddingConfig,
VertexAIEmbeddingEncoder,
)
return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
else:
raise ValueError(f"{self.provider} not a recognized encoder")

View File

@ -7,6 +7,7 @@ import json
import os
import platform
import subprocess
import threading
from datetime import datetime
from functools import wraps
from itertools import combinations
@ -773,3 +774,25 @@ def catch_overlapping_and_nested_bboxes(
document_with_overlapping_flag = True
return document_with_overlapping_flag, overlapping_cases
class FileHandler:
    """Read, write, and delete one file, serializing access with a lock.

    All handlers created in this process for the same absolute path share a
    single lock, so separate ``FileHandler`` instances pointing at one file
    exclude each other. The lock is a ``threading.Lock``: it coordinates threads
    within one process only, not separate processes.
    """

    # Per-path locks shared by every handler; access is guarded by
    # ``_registry_lock``. Entries are never evicted, which is fine for the small,
    # fixed set of paths this helper is used with.
    _path_locks: dict = {}
    _registry_lock = threading.Lock()

    def __init__(self, file_path: str):
        self.file_path = file_path
        lock_key = os.path.abspath(file_path)
        with FileHandler._registry_lock:
            self.lock = FileHandler._path_locks.setdefault(lock_key, threading.Lock())

    def read_file(self) -> str:
        """Return the full text content of the file."""
        with self.lock:
            with open(self.file_path) as file:
                return file.read()

    def write_file(self, data: str) -> None:
        """Replace the file's content with ``data``, creating the file if needed."""
        with self.lock:
            with open(self.file_path, "w") as file:
                file.write(data)

    def cleanup_file(self) -> None:
        """Delete the file; a file that is already gone is not an error."""
        with self.lock:
            # Remove-and-tolerate instead of check-then-remove, so a concurrent
            # deletion between the check and the removal cannot raise.
            try:
                os.remove(self.file_path)
            except FileNotFoundError:
                pass