-
Notifications
You must be signed in to change notification settings - Fork 0
/
embeddings.py
120 lines (101 loc) · 4.16 KB
/
embeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""Wrapper around OpenAI embedding models."""
from typing import Any, Dict, List, Optional
from langchain.embeddings.base import Embeddings
from langchain.utils import get_from_dict_or_env
from openai.error import APIConnectionError, APIError, RateLimitError, Timeout
from pydantic import BaseModel, Extra, root_validator
from tenacity import (
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
class OpenAIEmbeddings(BaseModel, Embeddings):
    """Wrapper around OpenAI embedding models.

    To use, you should have the ``openai`` python package installed, and the
    environment variable ``OPENAI_API_KEY`` set with your API key or pass it
    as a named parameter to the constructor.

    Separate engine names are kept for documents and for queries; both
    default to ``text-embedding-ada-002``.

    Example:
        .. code-block:: python

            from langchain.embeddings import OpenAIEmbeddings
            openai = OpenAIEmbeddings(openai_api_key="my-api-key")
    """

    client: Any  #: :meta private:
    # Engine passed to the embedding endpoint by ``embed_documents``.
    document_model_name: str = "text-embedding-ada-002"
    # Engine passed to the embedding endpoint by ``embed_query``.
    query_model_name: str = "text-embedding-ada-002"
    # Optional explicit key; otherwise ``OPENAI_API_KEY`` is consulted.
    openai_api_key: Optional[str] = None

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    # TODO: deprecate this
    @root_validator(pre=True, allow_reuse=True)
    def get_model_names(cls, values: Dict) -> Dict:
        """Expand the legacy ``model_name`` argument into both engine names.

        Args:
            values: Raw constructor keyword arguments (pre-validation).

        Returns:
            ``values`` with ``model_name`` removed and replaced by
            ``document_model_name`` / ``query_model_name``; returned
            untouched when ``model_name`` was not supplied.

        Raises:
            ValueError: If ``model_name`` is combined with either
                ``document_model_name`` or ``query_model_name``.
        """
        if "model_name" not in values:
            return values

        # The legacy argument is mutually exclusive with the explicit ones.
        if "document_model_name" in values:
            raise ValueError(
                "Both `model_name` and `document_model_name` were provided, "
                "but only one should be."
            )
        if "query_model_name" in values:
            raise ValueError(
                "Both `model_name` and `query_model_name` were provided, "
                "but only one should be."
            )

        # Pop the key so that ``Extra.forbid`` does not reject it later.
        model_name = values.pop("model_name")
        values["document_model_name"] = f"text-search-{model_name}-doc-001"
        values["query_model_name"] = f"text-search-{model_name}-query-001"
        return values

    @root_validator(allow_reuse=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key and python package exists in environment.

        Args:
            values: Field values collected by pydantic.

        Returns:
            ``values`` with ``client`` set to ``openai.Embedding``.

        Raises:
            ValueError: If the ``openai`` package cannot be imported. Any
                error raised by ``get_from_dict_or_env`` for a missing key
                propagates unchanged.
        """
        openai_api_key = get_from_dict_or_env(
            values, "openai_api_key", "OPENAI_API_KEY"
        )

        # Keep the ``try`` limited to the import: nothing after it can raise
        # ImportError, and the original cause is chained for debugging.
        try:
            import openai
        except ImportError as err:
            raise ValueError(
                "Could not import openai python package. "
                "Please install it with `pip install openai`."
            ) from err

        # NOTE: this sets the key on the ``openai`` module itself, so it is
        # shared by everything using that module in this process.
        openai.api_key = openai_api_key
        values["client"] = openai.Embedding
        return values

    @retry(
        reraise=True,
        stop=stop_after_attempt(100),
        wait=wait_exponential(multiplier=1, min=10, max=60),
        retry=(
            retry_if_exception_type(Timeout)
            | retry_if_exception_type(APIError)
            | retry_if_exception_type(APIConnectionError)
            | retry_if_exception_type(RateLimitError)
        ),
    )
    def _embedding_func(self, text: str, *, engine: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint with exponential backoff.

        The call is retried (up to 100 attempts, exponential wait configured
        with ``min=10`` and ``max=60``) on ``Timeout``, ``APIError``,
        ``APIConnectionError`` and ``RateLimitError``; with ``reraise=True``
        the last such error is re-raised once attempts run out.

        Args:
            text: The text to embed.
            engine: Name of the embedding engine to use.

        Returns:
            The ``embedding`` entry of the first item in the response's
            ``data`` list.
        """
        # replace newlines, which can negatively affect performance.
        cleaned = text.replace("\n", " ")
        response = self.client.create(input=[cleaned], engine=engine)
        return response["data"][0]["embedding"]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint for embedding search docs.

        One request is issued per text, using ``document_model_name``.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text, in input order.
        """
        return [
            self._embedding_func(text, engine=self.document_model_name)
            for text in texts
        ]

    def embed_query(self, text: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text, computed with ``query_model_name``.
        """
        return self._embedding_func(text, engine=self.query_model_name)