Anirudh Esthuri committed on
Commit 4a34f6e · 1 Parent(s): d349f76

Add inference profile ARNs for provisioned throughput Claude models

Files changed (2)
  1. llm.py +31 -13
  2. model_config.py +7 -0
llm.py CHANGED
@@ -6,7 +6,7 @@ import boto3
 import openai
 import requests
 from dotenv import load_dotenv
-from model_config import MODEL_TO_PROVIDER
+from model_config import MODEL_TO_PROVIDER, MODEL_TO_INFERENCE_PROFILE_ARN
 
 # ──────────────────────────────────────────────────────────────
 # Load environment variables
@@ -119,11 +119,13 @@ def chat(messages, persona):
 
     try:
         bedrock_runtime = get_bedrock_client()
-        response = bedrock_runtime.invoke_model(
-            modelId=MODEL_STRING,
-            contentType="application/json",
-            accept="application/json",
-            body=json.dumps(
+
+        # Use the inference profile ARN if one is configured (provisioned
+        # throughput models); otherwise fall back to the plain model id.
+        invoke_kwargs = {
+            "contentType": "application/json",
+            "accept": "application/json",
+            "body": json.dumps(
                 {
                     "anthropic_version": "bedrock-2023-05-31",
                     "system": system_prompt,
@@ -132,7 +134,15 @@ def chat(messages, persona):
                     "temperature": 0.3,  # Lower temperature for more focused responses
                 }
             ),
-        )
+        }
+
+        # InvokeModel accepts an inference profile ARN directly in modelId
+        if MODEL_STRING in MODEL_TO_INFERENCE_PROFILE_ARN:
+            invoke_kwargs["modelId"] = MODEL_TO_INFERENCE_PROFILE_ARN[MODEL_STRING]
+        else:
+            invoke_kwargs["modelId"] = MODEL_STRING
+
+        response = bedrock_runtime.invoke_model(**invoke_kwargs)
 
         dt = time.time() - t0
         body = json.loads(response["body"].read())
@@ -374,17 +384,25 @@ def check_credentials():
     try:
         bedrock_runtime = get_bedrock_client()
         # Try a simple test invocation to verify credentials
-        test_response = bedrock_runtime.invoke_model(
-            modelId="anthropic.claude-haiku-4-5-20251001-v1:0",
-            contentType="application/json",
-            accept="application/json",
-            body=json.dumps({
+        test_model = "anthropic.claude-haiku-4-5-20251001-v1:0"
+        test_kwargs = {
+            "contentType": "application/json",
+            "accept": "application/json",
+            "body": json.dumps({
                 "anthropic_version": "bedrock-2023-05-31",
                 "messages": [{"role": "user", "content": "test"}],
                 "max_tokens": 10,
                 "temperature": 0.1
            })
-        )
+        }
+
+        # Use the inference profile ARN if one is configured for the test model
+        if test_model in MODEL_TO_INFERENCE_PROFILE_ARN:
+            test_kwargs["modelId"] = MODEL_TO_INFERENCE_PROFILE_ARN[test_model]
+        else:
+            test_kwargs["modelId"] = test_model
+
+        test_response = bedrock_runtime.invoke_model(**test_kwargs)
         print("Bedrock connection successful")
         return True
     except Exception as e:
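For readers skimming the diff, the change reduces to a lookup before the call. The sketch below is illustrative only (the helper names resolve_invoke_model_id and invoke are not from this repo); it relies on the fact that Bedrock's InvokeModel accepts an inference profile ARN in the same modelId field used for plain model ids, which is why no separate parameter is needed:

import json

import boto3

# Illustrative map: one entry copied from model_config.py in this commit.
MODEL_TO_INFERENCE_PROFILE_ARN = {
    "anthropic.claude-haiku-4-5-20251001-v1:0":
        "arn:aws:bedrock:us-east-1:850995563530:inference-profile/global.anthropic.claude-haiku-4-5-20251001-v1:0",
}

def resolve_invoke_model_id(model_string: str) -> str:
    # InvokeModel takes either a model id or an inference profile ARN in
    # modelId, so the dispatch collapses to a single dict lookup.
    return MODEL_TO_INFERENCE_PROFILE_ARN.get(model_string, model_string)

def invoke(model_string: str, payload: dict) -> dict:
    client = boto3.client("bedrock-runtime", region_name="us-east-1")
    response = client.invoke_model(
        modelId=resolve_invoke_model_id(model_string),
        contentType="application/json",
        accept="application/json",
        body=json.dumps(payload),
    )
    return json.loads(response["body"].read())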
model_config.py CHANGED
@@ -31,3 +31,10 @@ MODEL_DISPLAY_NAMES = {
 }
 
 MODEL_CHOICES = [model for models in PROVIDER_MODEL_MAP.values() for model in models]
+
+# Inference profile ARNs for provisioned throughput models
+MODEL_TO_INFERENCE_PROFILE_ARN = {
+    "anthropic.claude-haiku-4-5-20251001-v1:0": "arn:aws:bedrock:us-east-1:850995563530:inference-profile/global.anthropic.claude-haiku-4-5-20251001-v1:0",
+    "anthropic.claude-sonnet-4-5-20250929-v1:0": "arn:aws:bedrock:us-east-1:850995563530:inference-profile/global.anthropic.claude-sonnet-4-5-20250929-v1:0",
+    "anthropic.claude-opus-4-20250514-v1:0": "arn:aws:bedrock:us-east-1:850995563530:inference-profile/global.anthropic.claude-opus-4-20250514-v1:0",
+}
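Since each of these global profile ARNs embeds the model id it serves, a small consistency check can guard the table against copy-paste slips (such as an opus key pointing at a sonnet profile, corrected above). This is a hypothetical addition, not part of the commit:

# Hypothetical sanity check for model_config.py (not in this commit).
from model_config import MODEL_TO_INFERENCE_PROFILE_ARN

def check_profile_arns() -> None:
    # For the global cross-region profiles listed here, each ARN should end
    # in "inference-profile/global.<model id>" for its own key.
    for model_id, arn in MODEL_TO_INFERENCE_PROFILE_ARN.items():
        expected = f"inference-profile/global.{model_id}"
        assert arn.endswith(expected), f"ARN mismatch for {model_id}: {arn}"

if __name__ == "__main__":
    check_profile_arns()
    print("All inference profile ARNs match their model ids")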