Package gguf_llama
Source code
from .gguf_llama import LlamaAI
__all__ = ['LlamaAI']
Sub-modules
gguf_llama.gguf_llama
Classes
class LlamaAI (model_gguf_path: str, max_tokens: int, **llama_kwrgs: Any)
A class for interfacing with Llama models.
Attributes

model_path : str
    The path to the Llama model file.
max_tokens : int
    The maximum number of tokens to generate.
max_input_tokens : int
    The maximum number of tokens allowed in the input text.
llm : Llama
    The loaded Llama model instance.
tokenizer : LlamaTokenizer
    The tokenizer for encoding/decoding text.
_loaded : bool
    Whether the model is loaded.
_llama_kwrgs : dict
    Additional kwargs to pass when loading the Llama model.
Initialize the LlamaAI instance.
Args

model_gguf_path : str
    Path to the .gguf model file.
max_tokens : int
    Maximum number of tokens to process.
llama_kwrgs
    Additional kwargs for the Llama model, passed through to the llama-cpp-python backend.
Source code
class LlamaAI: """ A class for interfacing with Llama models. Attributes: model_path (str): The path to the Llama model file. max_tokens (int): The maximum number of tokens to generate. max_input_tokens (int): The maximum number of tokens allowed in the input text. llm (Llama): The loaded Llama model instance. tokenizer (LlamaTokenizer): The tokenizer for encoding/decoding text. _loaded (bool): Whether the model is loaded. _llama_kwrgs (dict): Additional kwargs to pass when loading Llama model. """ def __init__(self, model_gguf_path:str, max_tokens:int, **llama_kwrgs:Any) -> None: """ Initialize the LlamaAI instance. Args: model_gguf_path (str): Path to .gguf model file. max_tokens (int): Max tokens to be processed llama_kwrgs: Additional kwargs for Llama model compatible with llama-cpp-python BE """ self.model_path = model_gguf_path self.max_tokens = max_tokens self._max_input_tokens = None self.llm = None self.tokenizer = None self._loaded = False self._llama_kwrgs = llama_kwrgs self.load() def load(self) -> None: """ Load the Llama model and tokenizer based on initialized attributes. Sets the llm and tokenizer attributes. Sets _loaded to True once complete. """ print(f"Loading model from {self.model_path}...") self.llm = Llama(model_path=self.model_path, verbose=False, n_ctx=self.max_tokens, kwargs=self._llama_kwrgs) self.tokenizer = LlamaTokenizer(self.llm) self._loaded = True def _try_fixing_format(self, text: str, only_letters: bool = False, rem_list_formatting: bool = False) -> str: """ Attempt to fix formatting issues in the input text. Removes extra newlines, non-letter characters, and list formatting. Prints a message if changes are made. Args: text (str): The input text to fix. only_letters (bool): Whether to remove all non-letters. rem_list_formatting (bool): Whether to remove list formatting. Returns: str: The fixed text. """ print("Trying to fix formatting... this might have some undersired effects") changes = False if "\n\n" in text: #split text in that place core_info = text.split("\n\n")[1:] text = " ".join(core_info) changes = True if "\n" in text: text = text.replace("\n", " ") changes = True if only_letters: text = remove_non_letters(text) changes = True if rem_list_formatting: text = remove_list_formatting(text) if changes: print("The text has been sucessfully modified.") return text def _check_loaded(self) -> None: """ Check if the model is loaded, load it if not. Raises an exception if loading fails. """ if not self._loaded: try: self.load() raise Warning("Model not loaded, trying a default reload...") except: raise Exception("Model not loaded! Please provide model settings when creating the class or use load_model method after creation.") def _set_total_token_limit(self, new_max_tokens: int) -> None: """ Adjust the max_tokens attribute. Args: new_max_tokens (int): The new max tokens value. Sets _loaded to False to trigger reloading. """ self.max_tokens = new_max_tokens self._loaded = False def _set_input_token_limit(self, new_max_input_tokens: int=None) -> None: """ Adjust the max_input_tokens attribute. Args: new_max_input_tokens (int): The new max input tokens value. Raises an exception if the new value is less than max_tokens. 
""" if new_max_input_tokens is None or (new_max_input_tokens is not None and new_max_input_tokens <= 0): self._max_input_tokens = None print("Max input tokens limit cleared.") elif new_max_input_tokens < self.max_tokens: raise Exception("The new maximum input tokens must be greater than the current maximum tokens.") elif self._max_input_tokens is None or new_max_input_tokens != self._max_input_tokens: self._max_input_tokens = new_max_input_tokens def set_max_tokens(self, new_max_tokens: int, max_input_tokens_limit:Optional[int]=None) -> None: """ Adjust both the max tokens and max input tokens. Args: new_max_tokens (int): New max tokens value. new_max_input_tokens (int): New max input tokens value. Calls _adjust methods to update attributes. Reloads the model after adjusting. """ self._set_total_token_limit(new_max_tokens) self._set_input_token_limit(max_input_tokens_limit) self.load() def tokenize(self, text: str) -> list: """ Tokenize the input text using the loaded tokenizer. Args: text (str): The text to tokenize. Returns: list: The list of tokenized tokens. """ ts = self.tokenizer.encode(text) return ts def untokenize(self, tokens: list) -> str: """ Decode a list of tokens back into a string. Args: tokens (list): The tokens to untokenize. Returns: str: The decoded string. """ return self.tokenizer.decode(tokens) def count_tokens(self, text: str) -> int: """ Count the number of tokens needed to tokenize the text. Args: text (str): The text to count tokens for. Returns: int: The number of tokens. """ return len(self.tokenize(text)) def is_prompt_within_limit(self, text: str) -> bool: """ Check if the text is within the max input tokens limit. Args: text (str): The text to check. Returns: bool: True if under input token limit, False otherwise. """ tcc = self.count_tokens(text) print(f"Input length: {tcc} tokens") if self._max_input_tokens is not None: r = self.count_tokens(text) <= self.max_tokens print(f"Max input length set to: {self._max_input_tokens} tokens") else: r = self.count_tokens(text) <= self.max_tokens print(f"Max input length not set, using max tokens: {self.max_tokens} tokens") return r def clear_input_tokens_limit(self) -> None: """ Clear the max input tokens limit. """ self._set_input_token_limit(None) def infer(self, text:str, only_string: bool = True, stop_at_str=None, include_stop_str=True) -> Union[str, dict]: """ Generate inference text for the input prompt. Args: text (str): The prompt text. only_string (bool): Whether to return just text or OpenAI object. stop_at_str (str): The string to stop at. include_stop_str (bool): Whether to include the stop string in the output. Returns: str/list: The generated text or OpenAI inference object. Raises an exception if the text is too long and no max tokens provided. Adjusts model tokens if needed to fit prompt. """ text = str(text) self._check_loaded() if not self.is_prompt_within_limit(text): raise Exception("Text is too long!") else: stop_at = None if any([stop_at_str is None, stop_at_str == ""]) else stop_at_str output:dict = self.llm(text, max_tokens=self.max_tokens, stop=stop_at) if only_string: output = self._text_from_inference_obj(output) if include_stop_str: output += stop_at_str if stop_at_str is not None else "" return output def _text_from_inference_obj(self, answer_dict: dict) -> str: if 'choices' in answer_dict and 'text' in answer_dict['choices'][0]: extracted_answ = answer_dict['choices'][0]['text'] return extracted_answ
Methods
def clear_input_tokens_limit(self) ‑> None
Clear the max input tokens limit.
Source code
def clear_input_tokens_limit(self) -> None:
    """
    Clear the max input tokens limit.
    """
    self._set_input_token_limit(None)
def count_tokens(self, text: str) ‑> int
Count the number of tokens needed to tokenize the text.
Args

text : str
    The text to count tokens for.

Returns

int
    The number of tokens.
Source code
def count_tokens(self, text: str) -> int:
    """
    Count the number of tokens needed to tokenize the text.

    Args:
        text (str): The text to count tokens for.

    Returns:
        int: The number of tokens.
    """
    return len(self.tokenize(text))
def infer(self, text: str, only_string: bool = True, stop_at_str=None, include_stop_str=True) ‑> Union[str, dict]
Generate inference text for the input prompt.
Args

text : str
    The prompt text.
only_string : bool
    Whether to return just the generated text or the full OpenAI-style response object.
stop_at_str : str
    The string to stop generation at.
include_stop_str : bool
    Whether to include the stop string in the output.

Returns

str or dict
    The generated text, or the full OpenAI-style inference object. Raises an exception if the prompt exceeds the token limit.
Source code
def infer(self, text: str, only_string: bool = True, stop_at_str=None,
          include_stop_str=True) -> Union[str, dict]:
    """
    Generate inference text for the input prompt.

    Args:
        text (str): The prompt text.
        only_string (bool): Whether to return just the text or the full
            OpenAI-style response object.
        stop_at_str (str): The string to stop generation at.
        include_stop_str (bool): Whether to include the stop string in the output.

    Returns:
        str/dict: The generated text or the OpenAI-style inference object.

    Raises an exception if the prompt exceeds the token limit.
    """
    text = str(text)
    self._check_loaded()
    if not self.is_prompt_within_limit(text):
        raise Exception("Text is too long!")
    # Treat an empty stop string the same as no stop string.
    stop_at = None if stop_at_str in (None, "") else stop_at_str
    output: dict = self.llm(text, max_tokens=self.max_tokens, stop=stop_at)
    if only_string:
        output = self._text_from_inference_obj(output)
        if include_stop_str and stop_at_str is not None:
            output += stop_at_str
    return output
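A short sketch of the two return modes, assuming the ai instance from the earlier example:

# Default: only_string=True returns just the generated text.
answer = ai.infer("List three colors:", stop_at_str=".", include_stop_str=False)

# only_string=False returns the full OpenAI-style completion dict instead.
raw = ai.infer("List three colors:", only_string=False)
answer = raw["choices"][0]["text"]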
def is_prompt_within_limit(self, text: str) ‑> bool
Check if the text is within the max input tokens limit.
Args

text : str
    The text to check.

Returns

bool
    True if under the input token limit, False otherwise.
Source code
def is_prompt_within_limit(self, text: str) -> bool:
    """
    Check if the text is within the max input tokens limit.

    Args:
        text (str): The text to check.

    Returns:
        bool: True if under the input token limit, False otherwise.
    """
    token_count = self.count_tokens(text)
    print(f"Input length: {token_count} tokens")
    if self._max_input_tokens is not None:
        # Compare against the dedicated input limit when one is set.
        result = token_count <= self._max_input_tokens
        print(f"Max input length set to: {self._max_input_tokens} tokens")
    else:
        result = token_count <= self.max_tokens
        print(f"Max input length not set, using max tokens: {self.max_tokens} tokens")
    return result
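One hedged way to use this check, assuming the ai instance from the earlier example; the truncation strategy shown is illustrative and not part of the class:

prompt = "some long input text " * 500
if not ai.is_prompt_within_limit(prompt):
    # Keep roughly half the context window for the prompt, leaving room to generate.
    tokens = ai.tokenize(prompt)
    prompt = ai.untokenize(tokens[:ai.max_tokens // 2])
answer = ai.infer(prompt)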
def load(self) ‑> None
Load the Llama model and tokenizer based on initialized attributes.
Sets the llm and tokenizer attributes. Sets _loaded to True once complete.
Source code
def load(self) -> None:
    """
    Load the Llama model and tokenizer based on the initialized attributes.

    Sets the llm and tokenizer attributes. Sets _loaded to True once complete.
    """
    print(f"Loading model from {self.model_path}...")
    # Forward the stored kwargs to the Llama constructor.
    self.llm = Llama(model_path=self.model_path, verbose=False,
                     n_ctx=self.max_tokens, **self._llama_kwrgs)
    self.tokenizer = LlamaTokenizer(self.llm)
    self._loaded = True
def set_max_tokens(self, new_max_tokens: int, max_input_tokens_limit: Optional[int] = None) ‑> None
Adjust both the max tokens and max input tokens.
Args

new_max_tokens : int
    New max tokens value.
max_input_tokens_limit : int
    New max input tokens value.

Calls the _set_total_token_limit and _set_input_token_limit helpers to update the attributes, then reloads the model.
Source code
def set_max_tokens(self, new_max_tokens: int,
                   max_input_tokens_limit: Optional[int] = None) -> None:
    """
    Adjust both the max tokens and the max input tokens limit.

    Args:
        new_max_tokens (int): New max tokens value.
        max_input_tokens_limit (int): New max input tokens value.

    Reloads the model after adjusting.
    """
    self._set_total_token_limit(new_max_tokens)
    self._set_input_token_limit(max_input_tokens_limit)
    self.load()
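For instance, with illustrative values (note that _set_input_token_limit rejects an input limit smaller than max_tokens):

ai.set_max_tokens(2048, max_input_tokens_limit=4096)  # reloads the model
ai.clear_input_tokens_limit()  # fall back to max_tokens as the input limit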
def tokenize(self, text: str) ‑> list
Tokenize the input text using the loaded tokenizer.
Args

text : str
    The text to tokenize.

Returns

list
    The list of tokens.
Source code
def tokenize(self, text: str) -> list:
    """
    Tokenize the input text using the loaded tokenizer.

    Args:
        text (str): The text to tokenize.

    Returns:
        list: The list of tokens.
    """
    return self.tokenizer.encode(text)
def untokenize(self, tokens: list) ‑> str
Decode a list of tokens back into a string.
Args

tokens : list
    The tokens to untokenize.

Returns

str
    The decoded string.
Source code
def untokenize(self, tokens: list) -> str:
    """
    Decode a list of tokens back into a string.

    Args:
        tokens (list): The tokens to untokenize.

    Returns:
        str: The decoded string.
    """
    return self.tokenizer.decode(tokens)
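A round-trip sketch, assuming the ai instance from the earlier example:

tokens = ai.tokenize("hello world")
print(len(tokens) == ai.count_tokens("hello world"))  # True by definition
print(ai.untokenize(tokens))  # roughly recovers the text; llama tokenizers
                              # may alter leading whitespace or special tokens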