Spaces:

lainlives
/

ztestzz

Paused

App Files Files Community

lainlives committed on Mar 8

Commit

7d36c43

verified ·

1 Parent(s): 2d10560

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

Dockerfile +7 -1
README.md +0 -2
app.py +9 -10
start.sh +3 -15

Dockerfile CHANGED Viewed

@@ -1,5 +1,10 @@
 FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y
 RUN apt-get update && \
     apt-get upgrade -y
@@ -72,7 +77,8 @@ RUN pip install git+https://github.com/huggingface/transformers.git --break-syst
 ENV PYTHONPATH=${HOME}/app \
     PYTHONUNBUFFERED=1 \

 FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04
 ENV DEBIAN_FRONTEND=noninteractive
+ARG HF_TOKEN
+ENV HF_TOKEN=$HF_TOKEN
 RUN apt-get update && apt-get install -y
 RUN apt-get update && \
     apt-get upgrade -y
+RUN hf download lainlives/llama.cpp --local-dir /usr/bin/ --token $HF_TOKEN
+RUN chmod +x /usr/bin/llama-*
 ENV PYTHONPATH=${HOME}/app \
     PYTHONUNBUFFERED=1 \

README.md CHANGED Viewed

@@ -8,5 +8,3 @@ pinned: false
 suggested_hardware: "a10g-large"
 disable_embedding: true
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 suggested_hardware: "a10g-large"
 disable_embedding: true
 ---

app.py CHANGED Viewed

@@ -63,19 +63,14 @@ def push_to_ollama(gguf_path, ollama_repo, tag_suffix):
     logs.append(format_log(f"🐳 Creating Ollama build: {ollama_tag}"))
     try:
-        # 2. Run 'ollama create' via CLI
-        # This replaces ollama.create(...)
         create_cmd = ["ollama", "create", ollama_tag, "-f", str(modelfile_path)]
         subprocess.run(create_cmd, check=True, capture_output=True)
-        # Clean up the temporary Modelfile
         if modelfile_path.exists():
             os.remove(modelfile_path)
         logs.append(format_log(f"⬆️ Pushing to registry: {ollama_tag}..."))
-        # 3. Run 'ollama push' via CLI
-        # This replaces ollama.push(...)
         push_cmd = ["ollama", "push", ollama_tag]
         push_result = subprocess.run(push_cmd, capture_output=True, text=True)
@@ -84,7 +79,7 @@ def push_to_ollama(gguf_path, ollama_repo, tag_suffix):
         else:
             logs.append(format_log(f"❌ Push failed: {push_result.stderr}"))
-        # Optional: Remove the local tag to save disk space in the container
         subprocess.run(["ollama", "rm", ollama_tag], stdout=subprocess.DEVNULL)
     except subprocess.CalledProcessError as e:
@@ -111,13 +106,16 @@ def start_ollama_daemon(ollama_key):
     process = subprocess.Popen(["ollama", "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=env)
     pid = process.pid
     logs.append(format_log("⏳ Starting Ollama daemon in background..."))
-    return pid
 def stop_ollama_daemon(pid):
     print("⏳ Stopping Ollama daemon...")
-    os.kill(pid, signal.SIGKILL)
     subprocess.Popen(["pkill", "ollama"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 def run_conversion(hf_repo, ollama_repo, hf_token, progress=gr.Progress()):
@@ -150,7 +148,7 @@ def run_conversion(hf_repo, ollama_repo, hf_token, progress=gr.Progress()):
         logs.append(format_log("⚙️ Converting to BF16..."))
         yield "\n".join(logs)
-        cmd = ["python3", str(CONVERT_SCRIPT), str(model_path), "--outtype", "bf16", "--outfile", str(bf16_path)]
         result = subprocess.run(cmd, capture_output=True, text=True)
         if result.returncode == 0:
@@ -166,7 +164,7 @@ def run_conversion(hf_repo, ollama_repo, hf_token, progress=gr.Progress()):
         logs.append(format_log("⚙️ Converting to FP16 (Master)..."))
         yield "\n".join(logs)
-        cmd = ["python3", str(CONVERT_SCRIPT), str(model_path), "--outtype", "f16", "--outfile", str(fp16_path)]
         subprocess.run(cmd, check=True, capture_output=True)
         logs.extend(push_to_ollama(fp16_path, ollama_repo, "f16"))
@@ -209,6 +207,7 @@ def run_pipeline(hf_repo, ollama_repo, hf_token, ollama_key, progress=gr.Progres
     # We yield from the generator
     for update in run_conversion(hf_repo, ollama_repo, hf_token, progress):
         yield update
     stop_ollama_daemon(pid)

     logs.append(format_log(f"🐳 Creating Ollama build: {ollama_tag}"))
     try:
         create_cmd = ["ollama", "create", ollama_tag, "-f", str(modelfile_path)]
         subprocess.run(create_cmd, check=True, capture_output=True)
         if modelfile_path.exists():
             os.remove(modelfile_path)
         logs.append(format_log(f"⬆️ Pushing to registry: {ollama_tag}..."))
         push_cmd = ["ollama", "push", ollama_tag]
         push_result = subprocess.run(push_cmd, capture_output=True, text=True)
         else:
             logs.append(format_log(f"❌ Push failed: {push_result.stderr}"))
+        # Remove the local tag to save disk space in the container
         subprocess.run(["ollama", "rm", ollama_tag], stdout=subprocess.DEVNULL)
     except subprocess.CalledProcessError as e:
     process = subprocess.Popen(["ollama", "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=env)
     pid = process.pid
     logs.append(format_log("⏳ Starting Ollama daemon in background..."))
+    sleep(2)
+    return pid, logs
 def stop_ollama_daemon(pid):
     print("⏳ Stopping Ollama daemon...")
+    logs.append(format_log("⏳ Stopping Ollama daemon..."))
+    os.kill(pid, signal.SIGQUIT)
     subprocess.Popen(["pkill", "ollama"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    return logs
 def run_conversion(hf_repo, ollama_repo, hf_token, progress=gr.Progress()):
         logs.append(format_log("⚙️ Converting to BF16..."))
         yield "\n".join(logs)
+        cmd = [str(CONVERT_SCRIPT), str(model_path), "--outtype", "bf16", "--outfile", str(bf16_path)]
         result = subprocess.run(cmd, capture_output=True, text=True)
         if result.returncode == 0:
         logs.append(format_log("⚙️ Converting to FP16 (Master)..."))
         yield "\n".join(logs)
+        cmd = [str(CONVERT_SCRIPT), str(model_path), "--outtype", "f16", "--outfile", str(fp16_path)]
         subprocess.run(cmd, check=True, capture_output=True)
         logs.extend(push_to_ollama(fp16_path, ollama_repo, "f16"))
     # We yield from the generator
     for update in run_conversion(hf_repo, ollama_repo, hf_token, progress):
         yield update
+    sleep(10)
     stop_ollama_daemon(pid)

start.sh CHANGED Viewed

@@ -1,24 +1,12 @@
 #!/bin/bash
-export CMAKE_CUDA_ARCHITECTURES="all"
-cd /app && \
-git clone --recursive https://github.com/ggerganov/llama.cpp && \
-cd llama.cpp && \
-cmake -B build -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON && \
-cmake --build build --config Release --parallel 32 && \
-cp ./build/bin/llama-* /usr/bin/ && \
-hf upload --repo-type model lainlives/ztestzz ./build/bin/
-cp convert_hf_to_gguf.py /app/convert_hf_to_gguf && \
-rm -rf build && \
-cd ..
-python3 /app/tmp.py
-ollama serve & # >/dev/null 2>&1 &
-PID="$!"
-disown "$PID"
 python3 app.py

 #!/bin/bash
+# ollama serve & # >/dev/null 2>&1 &
+# PID="$!"
+# disown "$PID"
 python3 app.py