
I am integrating the llama-cpp-python library to run Hugging Face LLMs locally. I can already generate text output, but I would like to add streaming to my chatbot so that Gradio starts receiving text as soon as generation begins.

Here is my code:

import os, sys, argparse
from threading import Thread
from typing import Optional

import torch
import gradio as gr
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate

from src import quantize
from core import list_download_models, remove_dir, default_repo_id, read_config, update_config
from modelsui import create_models_ui

# CLI arguments used by demo.launch() below (defaults are assumptions)
parser = argparse.ArgumentParser()
parser.add_argument("--host", default="127.0.0.1")
parser.add_argument("--port", type=int, default=7860)
parser.add_argument("--share", action="store_true")
args = parser.parse_args()

# streams generated tokens to stdout as they arrive (console only, not Gradio)
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

def snapshot_download_and_convert_to_gguf(repo_id):
    gguf_model_path = quantize.quantize_model(repo_id)
    return gguf_model_path

def init_llm_chain(model_path):
    llm = LlamaCpp(
        model_path=model_path,
        n_ctx=6000,
        n_batch=30,
        # temperature=0.9,
        # max_tokens=4095,
        n_parts=1,
        callback_manager=callback_manager, 
        verbose=True)

    template = """Question: {question}
        Answer: Let's work this out in a step by step way to be sure we have the right answer."""
    
    prompt = PromptTemplate.from_template(template)
    llm_chain = prompt | llm
    return llm_chain, llm

model_path = snapshot_download_and_convert_to_gguf(default_repo_id)
with gr.Blocks(css='style.css') as demo:
    with gr.Tab("Chat"):
        with gr.Row():
            with gr.Column(scale=1):
                pass  # placeholder; sidebar contents omitted from this snippet
            with gr.Column(scale=4):
                with gr.Group():
                    chatbot = gr.Chatbot(elem_id="chatbot-container")
                    msg = gr.Textbox(label="Prompt")
                    stop = gr.Button("Stop")
    
    llm_chain, llm = init_llm_chain(model_path)
    
    def user(user_message, history):
        return "", history + [[user_message, None]]
    
    def bot(history):
        print("Question: ", history[-1][0])
        # invoke() blocks until the whole completion is ready; the loop
        # below only replays it character by character afterwards
        output = llm_chain.invoke({"question": history[-1][0]})
        history[-1][1] = ""
        for character in output:
            history[-1][1] += character
            yield history
    
    submit_event = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)

demo.queue()
demo.launch(server_name=args.host, server_port=args.port, share=args.share)

I have tried several ways to make the chatbot stream, but none of them works.
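For example, one variant I tried swaps invoke() for the chain's stream() method (a sketch based on my setup above; my understanding is that stream() should yield text chunks as they are generated instead of blocking until the full completion):

    def bot(history):
        question = history[-1][0]
        history[-1][1] = ""
        # stream() yields chunks incrementally, so each chunk can be
        # appended to the last chatbot message and yielded to Gradio
        for chunk in llm_chain.stream({"question": question}):
            history[-1][1] += chunk
            yield history

Even with this, the UI only updates after the whole answer is finished.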

