Braszczynski committed · Commit 1ffa7d1 · verified · 1 Parent(s): cbac2a1

Update app.py

Files changed (1)
  1. app.py +12 -18
app.py CHANGED
@@ -1,29 +1,23 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from transformers import AutoModel, AutoTokenizer
+import torch
+from transformers import AutoTokenizer
 from adapters import AutoAdapterModel
 
-
 model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"
 
-
-tokenizer = AutoTokenizer.from_pretrained(model_name, load_in_4bit = True)
-
-# Load the base model with adapters
-model = AutoAdapterModel.from_pretrained(model_name)
+# Load tokenizer with 4-bit quantization
+tokenizer = AutoTokenizer.from_pretrained(model_name, load_in_4bit=True)
+
+# Load the base model with adapters, ensuring it's loaded in 4-bit
+model = AutoAdapterModel.from_pretrained(model_name, load_in_4bit=True).to("cpu")
 
+# Load the adapter
 model.load_adapter("Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps")
 
+# Free up unused memory
+torch.cuda.empty_cache()
 
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
+def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Combine system message and chat history
     chat_history = f"{system_message}\n"
     for user_msg, bot_reply in history:
@@ -33,10 +27,10 @@ def respond(
     # Tokenize the input
     inputs = tokenizer(chat_history, return_tensors="pt", truncation=True).to("cuda")
 
-    # Generate response
+    # Generate response with reduced max tokens if necessary
     outputs = model.generate(
         inputs["input_ids"],
-        max_new_tokens=max_tokens,
+        max_new_tokens=max_tokens, # Consider setting a lower default
         temperature=temperature,
         top_p=top_p,
         pad_token_id=tokenizer.eos_token_id
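
For reference, a minimal sketch of how the updated loading and generation path could be made internally consistent. It assumes a bitsandbytes-capable runtime and that adapters.AutoAdapterModel.from_pretrained forwards quantization_config and device_map to transformers; the prompt-building and decoding lines are also assumptions, since that code is not shown in the hunks above. The rationale: load_in_4bit has no effect on a tokenizer, a 4-bit quantized model cannot be moved with .to("cpu") and then be fed CUDA tensors, and BitsAndBytesConfig is the supported way to request 4-bit weights, so the sketch keeps inputs on the model's own device.

import torch
from transformers import AutoTokenizer, BitsAndBytesConfig
from adapters import AutoAdapterModel

model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"
adapter_repo = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"

# Tokenizers are not quantized; 4-bit loading only applies to model weights.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Request 4-bit weights via BitsAndBytesConfig; device_map="auto" places the
# quantized model on the GPU when one is available (it cannot be .to()-moved later).
model = AutoAdapterModel.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)
model.load_adapter(adapter_repo, set_active=True)

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Combine system message and chat history into a single prompt (assumed format).
    chat_history = f"{system_message}\n"
    for user_msg, bot_reply in history:
        chat_history += f"User: {user_msg}\nAssistant: {bot_reply}\n"
    chat_history += f"User: {message}\nAssistant:"

    # Keep inputs on the same device as the model instead of hard-coding "cuda".
    inputs = tokenizer(chat_history, return_tensors="pt", truncation=True).to(model.device)

    outputs = model.generate(
        inputs["input_ids"],
        max_new_tokens=max_tokens,
        do_sample=True,  # temperature/top_p only take effect when sampling
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Return only the newly generated tokens.
    return tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)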