You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Create an inference endpoint with a TEI container for reranking with a reranking model:
# Create an Inference Endpoint running the TEI container for reranking.
# NOTE: the original snippet only imported `create_inference_endpoint` but also
# called `huggingface_hub.list_inference_endpoints()` / `get_inference_endpoint()`
# on the bare module, which raises NameError — import the module as well.
import huggingface_hub
from huggingface_hub import create_inference_endpoint

repository = "BAAI/bge-reranker-base"  # "BAAI/bge-reranker-large-base"
endpoint_name = "bge-reranker-large-base-05"
namespace = "MoritzLaurer"  # your user or organization name

# Check if an endpoint with this name already exists from previous tests.
available_endpoints_names = [
    endpoint.name for endpoint in huggingface_hub.list_inference_endpoints()
]
endpoint_exists = endpoint_name in available_endpoints_names
print("Does the endpoint already exist?", endpoint_exists)

if not endpoint_exists:
    # Create a new endpoint.
    endpoint = create_inference_endpoint(
        endpoint_name,
        repository=repository,
        namespace=namespace,
        framework="pytorch",
        task="sentence-ranking",
        # see the available hardware options here:
        # https://huggingface.co/docs/inference-endpoints/pricing#pricing
        accelerator="gpu",
        vendor="aws",
        region="us-east-1",
        instance_size="x1",
        instance_type="nvidia-a10g",
        min_replica=0,
        max_replica=1,
        type="protected",
        custom_image={
            "health_route": "/health",
            "env": {
                "MAX_BATCH_TOKENS": "16384",
                "MAX_CONCURRENT_REQUESTS": "512",
                "MAX_BATCH_REQUESTS": "160",
                "MODEL_ID": "/repository",
            },
            "url": "ghcr.io/huggingface/text-embeddings-inference:latest",
        },
    )
    print("Waiting for endpoint to be created")
    endpoint.wait()
    print("Endpoint ready")
else:
    # An endpoint with this name already exists: reuse it and make sure it is running.
    endpoint = huggingface_hub.get_inference_endpoint(name=endpoint_name, namespace=namespace)
    if endpoint.status in ["paused", "scaledToZero"]:
        print("Resuming endpoint")
        endpoint.resume()
    print("Waiting for endpoint to start")
    endpoint.wait()
    print("Endpoint ready")
Send request both with /rerank path appended to endpoint.url or without:
# Query the reranking endpoint over plain HTTP.
# NOTE: `huggingface_hub` was used (get_token) without being imported — fixed.
import requests
import huggingface_hub

HEADERS = {"Authorization": f"Bearer {huggingface_hub.get_token()}"}
API_URL = endpoint.url + "/rerank"


def query(payload=None, api_url=None):
    """POST *payload* as JSON to *api_url* and return the decoded JSON response."""
    response = requests.post(api_url, headers=HEADERS, json=payload)
    # Surface HTTP errors explicitly instead of a confusing JSONDecodeError
    # when the server returns a non-JSON error body.
    response.raise_for_status()
    return response.json()


output = query(
    payload={
        "query": "What is Deep Learning?",
        "texts": ["Deep Learning is not...", "Deep learning is...", "testtest"],
    },
    api_url=API_URL,
)
print(output)
In both cases I get the same and correct reranking output.
On the other hand, when I create this endpoint for sentence-similarity with an embedding model:
# Create an Inference Endpoint running the TEI container for sentence-similarity.
# NOTE: same fix as the reranking snippet — the bare `huggingface_hub` module is
# used below, so it must be imported in addition to `create_inference_endpoint`.
import huggingface_hub
from huggingface_hub import create_inference_endpoint

repository = "thenlper/gte-large"  # "BAAI/bge-reranker-large-base"
endpoint_name = "gte-large-001"
namespace = "MoritzLaurer"  # your user or organization name

# Check if an endpoint with this name already exists from previous tests.
available_endpoints_names = [
    endpoint.name for endpoint in huggingface_hub.list_inference_endpoints()
]
endpoint_exists = endpoint_name in available_endpoints_names
print("Does the endpoint already exist?", endpoint_exists)

if not endpoint_exists:
    # Create a new endpoint.
    endpoint = create_inference_endpoint(
        endpoint_name,
        repository=repository,
        namespace=namespace,
        framework="pytorch",
        task="sentence-similarity",
        # see the available hardware options here:
        # https://huggingface.co/docs/inference-endpoints/pricing#pricing
        accelerator="gpu",
        vendor="aws",
        region="us-east-1",
        instance_size="x1",
        instance_type="nvidia-a10g",
        min_replica=2,
        max_replica=4,
        type="protected",
        custom_image={
            "health_route": "/health",
            "env": {
                "MAX_BATCH_TOKENS": "16384",
                "MAX_CONCURRENT_REQUESTS": "512",
                "MAX_BATCH_REQUESTS": "124",
                "MODEL_ID": "/repository",
            },
            "url": "ghcr.io/huggingface/text-embeddings-inference:latest",
        },
    )
    print("Waiting for endpoint to be created")
    endpoint.wait()
    print("Endpoint ready")
else:
    # An endpoint with this name already exists: reuse it and make sure it is running.
    endpoint = huggingface_hub.get_inference_endpoint(name=endpoint_name, namespace=namespace)
    if endpoint.status in ["paused", "scaledToZero"]:
        print("Resuming endpoint")
        endpoint.resume()
    print("Waiting for endpoint to start")
    endpoint.wait()
    print("Endpoint ready")
Then I need to append the /similarity route at the end of the URL to get correct outputs.
# Query the sentence-similarity endpoint over plain HTTP.
# NOTE: `huggingface_hub` was used (get_token) without being imported — fixed.
import requests
import huggingface_hub

# The /similarity route must be appended manually; without it the server
# returns a non-JSON body (this is the inconsistency reported in this issue).
API_URL = endpoint.url + "/similarity"  # "https://c5hhcabur7dqwyj7.us-east-1.aws.endpoints.huggingface.cloud" + "/similarity"
headers = {
    "Accept": "application/json",
    "Authorization": f"Bearer {huggingface_hub.get_token()}",
    "Content-Type": "application/json",
}


def query(payload):
    """POST *payload* as JSON to the similarity route and return the decoded JSON."""
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


output = query({
    "inputs": {
        "sentences": [
            "That is a happy dog",
            "That is a very happy person",
            "Today is a sunny day",
        ],
        "source_sentence": "That is a happy person",
        "parameters": {},
    }
})
output  # [0.91960955, 0.98106885, 0.8241128]
If I don't manually append /similarity to the URL, I get the following error:
---------------------------------------------------------------------------JSONDecodeErrorTraceback (mostrecentcalllast)
File~/miniconda/lib/python3.9/site-packages/requests/models.py:974, inResponse.json(self, **kwargs)
973try:
-->974returncomplexjson.loads(self.text, **kwargs)
975exceptJSONDecodeErrorase:
976# Catch JSON-related errors and raise as requests.JSONDecodeError977# This aliases json.JSONDecodeError and simplejson.JSONDecodeErrorFile~/miniconda/lib/python3.9/json/__init__.py:346, inloads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
343if (clsisNoneandobject_hookisNoneand344parse_intisNoneandparse_floatisNoneand345parse_constantisNoneandobject_pairs_hookisNoneandnotkw):
-->346return_default_decoder.decode(s)
347ifclsisNone:
File~/miniconda/lib/python3.9/json/decoder.py:337, inJSONDecoder.decode(self, s, _w)
333"""Return the Python representation of ``s`` (a ``str`` instance 334 containing a JSON document). 335 336 """-->337obj, end=self.raw_decode(s, idx=_w(s, 0).end())
338end=_w(s, end).end()
File~/miniconda/lib/python3.9/json/decoder.py:355, inJSONDecoder.raw_decode(self, s, idx)
354exceptStopIterationaserr:
-->355raiseJSONDecodeError("Expecting value", s, err.value) fromNone356returnobj, endJSONDecodeError: Expectingvalue: line1column1 (char0)
Duringhandlingoftheaboveexception, anotherexceptionoccurred:
JSONDecodeErrorTraceback (mostrecentcalllast)
CellIn[17], line1411response=requests.post(API_URL, headers=headers, json=payload)
12returnresponse.json()
--->14output=query({
15"inputs": {"sentences": [
16"That is a happy dog",
17"That is a very happy person",
18"Today is a sunny day"19 ],
20"source_sentence": "That is a happy person",
21"parameters": {}}
22 })
24outputCellIn[17], line12, inquery(payload)
10defquery(payload):
11response=requests.post(API_URL, headers=headers, json=payload)
--->12returnresponse.json()
File~/miniconda/lib/python3.9/site-packages/requests/models.py:978, inResponse.json(self, **kwargs)
974returncomplexjson.loads(self.text, **kwargs)
975exceptJSONDecodeErrorase:
976# Catch JSON-related errors and raise as requests.JSONDecodeError977# This aliases json.JSONDecodeError and simplejson.JSONDecodeError-->978raiseRequestsJSONDecodeError(e.msg, e.doc, e.pos)
JSONDecodeError: Expectingvalue: line1column1 (char0)
Expected behavior
Either consistently force to append the correct path, or not.
System Info
Inference endpoints
TEI version 1.5
Information
Tasks
Reproduction
Create an inference endpoint with a TEI container for reranking with a reranking model:
Send request both with
/rerank
path appended to endpoint.url or without. In both cases I get the same and correct reranking output.
On the other hand, when I create this endpoint for sentence-similarity with an embedding model:
Then I need to append the
/similarity
route at the end of the URL to get correct outputs. If I don't manually append
/similarity
to the URL, I get the following error:Expected behavior
Either consistently force to append the correct path, or not.
See this internal thread for context
The text was updated successfully, but these errors were encountered: