from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import time
import datetime
import streamlit as st

question = "Name the planets in the solar system? A: "
question = "Quais são os planetas do sistema solar?"   # "What are the planets of the solar system?"
question = "Qual é o maior planeta do sistema solar?"  # "What is the largest planet in the solar system?"

before = datetime.datetime.now()

# Use a pipeline as a high-level helper
messages = [
    {"role": "user", "content": question},
]

print('generating the output...')
pipe = pipeline("text-generation", model="01-ai/Yi-1.5-34B-Chat")
output = pipe(messages)
st.write(output)

# print('tokenizing...')
# tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
# print('tokenized.')

# print('loading the model...')
# # Since transformers 4.35.0, the GPT-Q/AWQ model can be loaded using AutoModelForCausalLM.
# model = AutoModelForCausalLM.from_pretrained(
#     model_path,
#     device_map="auto",
#     torch_dtype='auto'
# ).eval()
# print('model loaded.')

# # Prompt content: "hi"
# messages = [
#     {"role": "user", "content": question}
# ]

# print('tokenizing the prompt...')
# input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, return_tensors='pt')
# print('prompt tokenized.')

# print('generating the output...')
# output_ids = model.generate(input_ids, eos_token_id=tokenizer.eos_token_id,
#                             max_new_tokens=10)  # 10 # 45
#                             # max_new_tokens=22)
print('output generated.')

# print('decoding the output...')
# response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
# print('output decoded.')

# Model response: "Hello! How can I assist you today?"
# print(response)

# question = output['choices'][0]['text'].split('A:')[0]
# answer = output['choices'][0]['text'].split('A:')[1]
# answer = 'A: ' + answer

# With chat-style input, the pipeline returns the whole conversation under
# 'generated_text'; the last message holds the assistant's reply.
response = output[0]['generated_text'][-1]['content']

print('\n\n')
print(question)
print(response)

after = datetime.datetime.now()
current_time = after - before  # .strftime("%H:%M:%S")
print("\nTime Elapsed: ", current_time)