I am integrating Llama Cpp Python library to run huggingface LLMs on local, I am able to generate the output of text but i would like to add streaming to my chatbot so as soon as the generation is started gradio starts to get text.
Here is my code:
import os, torch, argparse
from threading import Thread
from typing import Optional
import gradio as gr
from llama_cpp import Llama
from src import quantize
from langchain import PromptTemplate, LLMChain
from langchain.llms.base import LLM
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate
from core import list_download_models, remove_dir, default_repo_id, read_config, update_config
from modelsui import create_models_ui
import sys
def snapshot_download_and_convert_to_gguf(repo_id):
gguf_model_path = quantize.quantize_model(repo_id)
return gguf_model_path
def init_llm_chain(model_path):
llm = LlamaCpp(
model_path=model_path,
n_ctx=6000,
n_batch=30,
# temperature=0.9,
# max_tokens=4095,
n_parts=1,
callback_manager=callback_manager,
verbose=True)
template = """Question: {question}
Answer: Let's work this out in a step by step way to be sure we have the right answer."""
prompt = PromptTemplate.from_template(template)
llm_chain = prompt | llm
return llm_chain, llm
model_path = snapshot_download_and_convert_to_gguf(default_repo_id)
with gr.Blocks(css='style.css') as demo:
with gr.Tab("Chat"):
with gr.Row():
with gr.Column(scale=1):
with gr.Column(scale=4):
with gr.Group():
chatbot = gr.Chatbot(elem_id="chatbot-container")
msg = gr.Textbox(label="Prompt")
stop = gr.Button("Stop")
llm_chain, llm = init_llm_chain(model_path)
def user(user_message, history):
return "", history + [[user_message, None]]
def bot(history):
print("Question: ", history[-1][0])
output = llm_chain.invoke({"question": history[-1][0]})
print("stream:", output)
history[-1][1] = ""
for character in output:
print(character)
history[-1][1] += character
yield history
submit_event = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
demo.queue()
demo.launch(server_name=args.host, server_port=args.port, share=args.share)
I have tried to create streaming chatbot but none of the methods works.