Spaces:

yahya-darman
/

AgenticStockAdvisor

Sleeping

AgenticStockAdvisor / modal_llm_service.py

Yahya Darman

Initial clean commit on master branch

82cfa24 12 months ago

3.21 kB

	import modal

	# Container image for the vLLM server: Debian slim with Python 3.12, pinned
	# versions of vLLM, the Hugging Face hub client (with the hf_transfer extra)
	# and FlashInfer, the latter resolved through the extra FlashInfer wheel index.
	# Two environment layers follow: one switches on hf_transfer downloads, the
	# other opts in to the vLLM V1 engine.
	vllm_image = (
	    modal.Image.debian_slim(python_version="3.12")
	    .pip_install(
	        "vllm==0.7.2",
	        "huggingface_hub[hf_transfer]==0.26.2",
	        "flashinfer-python==0.2.0.post2",
	        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
	    )
	    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
	    .env({"VLLM_USE_V1": "1"})
	)

	# --- Runtime settings -----------------------------------------------------
	MINUTES = 60  # seconds per minute, so durations read as "N * MINUTES"
	N_GPU = 1  # number of GPUs requested for the serving function
	VLLM_PORT = 8000  # port the vLLM server listens on and Modal forwards to
	# Bearer token sent by the local smoke test in this file. The server reads its
	# own key from the API_KEY environment variable (presumably supplied by the
	# attached Modal secret -- confirm the two values match).
	API_KEY = "buffetbot-llm-key"

	# --- Model selection ------------------------------------------------------
	MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
	MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"  # pinned revision
	MODELS_DIR = "/llamas"  # not referenced in the visible part of this file

	# --- Modal resources ------------------------------------------------------
	# The application object that the function and entrypoint register against.
	app = modal.App("buffetbot-llm-service")

	# Named volumes (created on first use) backing the Hugging Face and vLLM
	# cache directories mounted into the serving container.
	hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
	vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)

	@app.function(
	    image=vllm_image,
	    gpu=f"H100:{N_GPU}",
	    scaledown_window=15 * MINUTES,
	    timeout=10 * MINUTES,
	    volumes={
	        "/root/.cache/huggingface": hf_cache_vol,
	        "/root/.cache/vllm": vllm_cache_vol,
	    },
	    secrets=[modal.Secret.from_name("buffetbot-vllm-config")],
	)
	@modal.concurrent(max_inputs=100)
	@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
	def serve() -> None:
	    """Start ``vllm serve`` for MODEL_NAME in the background on VLLM_PORT.

	    The process is launched with ``subprocess.Popen`` and not waited on, so
	    this function returns as soon as the server process has been spawned.

	    Raises:
	        KeyError: if the ``API_KEY`` environment variable is not set
	            (it is expected to come from the attached Modal secret).
	    """
	    import os
	    import subprocess

	    cmd = [
	        "vllm",
	        "serve",
	        "--uvicorn-log-level=info",
	        MODEL_NAME,
	        "--revision",
	        MODEL_REVISION,
	        "--host",
	        "0.0.0.0",
	        "--port",
	        str(VLLM_PORT),
	        "--api-key",
	        os.environ["API_KEY"],
	        # Shard across every GPU requested above; a no-op while N_GPU == 1.
	        "--tensor-parallel-size",
	        str(N_GPU),
	    ]

	    # Pass the argument vector directly instead of a shell string: the API key
	    # is a secret of arbitrary content and must not be re-parsed by a shell
	    # (whitespace or metacharacters would corrupt or inject into the command).
	    subprocess.Popen(cmd)

	@app.local_entrypoint()
	def test(test_timeout: int = 10 * MINUTES) -> None:
	    """Smoke-test the deployed server: poll ``/health``, then send one chat request.

	    Args:
	        test_timeout: Maximum number of seconds to wait for the health check
	            to succeed; also used as the socket timeout of the chat request.

	    Raises:
	        TimeoutError: if ``/health`` never returned HTTP 200 within
	            ``test_timeout`` seconds.
	    """
	    import http.client
	    import json
	    import os
	    import time
	    import urllib.request

	    base_url = serve.get_web_url()
	    poll_delay = 10  # seconds between health-check attempts
	    # Monotonic clock: elapsed-time math must not be affected by clock changes.
	    deadline = time.monotonic() + test_timeout

	    print(f"Running health check for server at {base_url}")
	    healthy = False
	    last_error = None
	    while True:
	        remaining = deadline - time.monotonic()
	        try:
	            # Bound each attempt so a hung connection cannot outlive the
	            # overall deadline; always allow at least one short attempt.
	            with urllib.request.urlopen(
	                base_url + "/health", timeout=max(remaining, 1.0)
	            ) as response:
	                if response.getcode() == 200:
	                    healthy = True
	                    break
	        except (OSError, http.client.HTTPException) as exc:
	            # Connection errors, HTTP error statuses and timeouts are expected
	            # while the container is still starting; remember and retry.
	            last_error = exc

	        # Check the deadline and back off on every unsuccessful attempt, not
	        # only on exceptions, so a non-200 response cannot cause a busy loop.
	        remaining = deadline - time.monotonic()
	        if remaining <= 0:
	            break
	        time.sleep(min(poll_delay, remaining))

	    if not healthy:
	        # Explicit raise instead of assert: asserts are stripped under -O.
	        raise TimeoutError(
	            f"Failed health check for server at {base_url}"
	        ) from last_error
	    print(f"Successful health check for server at {base_url}")

	    # Test with a sample prompt
	    messages = [{"role": "user", "content": "What is the economic moat of Apple Inc.?"}]
	    print(f"Sending a sample message to {base_url}", *messages, sep="\n")

	    # The server authenticates against its API_KEY environment variable, so
	    # prefer the same variable locally and fall back to the module constant.
	    api_key = os.environ.get("API_KEY") or API_KEY
	    headers = {
	        "Authorization": f"Bearer {api_key}",
	        "Content-Type": "application/json",
	    }
	    payload = json.dumps({"messages": messages, "model": MODEL_NAME})
	    req = urllib.request.Request(
	        base_url + "/v1/chat/completions",
	        data=payload.encode("utf-8"),
	        headers=headers,
	        method="POST",
	    )
	    with urllib.request.urlopen(req, timeout=test_timeout) as response:
	        print(json.loads(response.read().decode()))