
Merge branch 'main' into main

Hamid Shojanazeri 7 months ago
parent
commit 99f78f7486

File diff suppressed because it is too large
+ 7 - 5
README.md


BIN
docs/images/messenger_llama_arch.jpg


BIN
docs/images/whatsapp_llama_arch.jpg


File diff suppressed because it is too large
+ 13 - 13
recipes/evaluation/README.md


+ 2 - 2
recipes/finetuning/huggingface_trainer/peft_finetuning.ipynb

@@ -251,7 +251,7 @@
     "        get_peft_model,\n",
     "        LoraConfig,\n",
     "        TaskType,\n",
-    "        prepare_model_for_int8_training,\n",
+    "        prepare_model_for_kbit_training,\n",
     "    )\n",
     "\n",
     "    peft_config = LoraConfig(\n",
@@ -264,7 +264,7 @@
     "    )\n",
     "\n",
     "    # prepare int-8 model for training\n",
-    "    model = prepare_model_for_int8_training(model)\n",
+    "    model = prepare_model_for_kbit_training(model)\n",
     "    model = get_peft_model(model, peft_config)\n",
     "    model.print_trainable_parameters()\n",
     "    return model, peft_config\n",

+ 4 - 4
recipes/inference/local_inference/inference.py

@@ -31,7 +31,7 @@ def main(
     temperature: float=1.0, # [optional] The value used to modulate the next token probabilities.
     top_k: int=50, # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering.
     repetition_penalty: float=1.0, #The parameter for repetition penalty. 1.0 means no penalty.
-    length_penalty: int=1, #[optional] Exponential penalty to the length that is used with beam-based generation. 
+    length_penalty: int=1, #[optional] Exponential penalty to the length that is used with beam-based generation.
     enable_azure_content_safety: bool=False, # Enable safety check with Azure content safety api
     enable_sensitive_topics: bool=False, # Enable check for sensitive topics using AuditNLG APIs
     enable_salesforce_content_safety: bool=True, # Enable safety check with Salesforce safety flan t5
@@ -98,12 +98,12 @@ def main(
             top_k=top_k,
             repetition_penalty=repetition_penalty,
             length_penalty=length_penalty,
-            **kwargs 
+            **kwargs
         )
     e2e_inference_time = (time.perf_counter()-start)*1000
     print(f"the inference time is {e2e_inference_time} ms")
     output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    
+
     # Safety check of the model output
     safety_results = [check(output_text, agent_type=AgentType.AGENT, user_prompt=user_prompt) for check in safety_checker]
     are_safe = all([r[1] for r in safety_results])
@@ -156,7 +156,7 @@ def main(
                 label="Output",
             )
         ],
-        title="Llama2 Playground",
+        title="Meta Llama3 Playground",
         description="https://github.com/facebookresearch/llama-recipes",
       ).queue().launch(server_name="0.0.0.0", share=True)
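For reference, a minimal sketch of how these sampling arguments flow into Hugging Face `generate`, mirroring the call updated in this hunk (the prompt, the `max_new_tokens` value, and the `model`/`tokenizer` variables are assumptions):

```python
import time
import torch

# Assumed: `model` and `tokenizer` are already loaded, as in the script.
batch = tokenizer("Tell me a short story.", return_tensors="pt").to(model.device)

start = time.perf_counter()
with torch.no_grad():
    outputs = model.generate(
        **batch,
        max_new_tokens=200,        # assumed value; the script reads this from its CLI args
        do_sample=True,
        top_p=1.0,
        temperature=1.0,
        top_k=50,
        repetition_penalty=1.0,
        length_penalty=1,
    )
e2e_inference_time = (time.perf_counter() - start) * 1000
print(f"the inference time is {e2e_inference_time} ms")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```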
 

File diff suppressed because it is too large
+ 56 - 43
recipes/inference/model_servers/llama-on-prem.md


File diff suppressed because it is too large
+ 1091 - 1
recipes/quickstart/Getting_to_know_Llama.ipynb


+ 0 - 219
recipes/quickstart/Running_Llama2_Anywhere/Running_Llama_on_Mac.ipynb

@@ -1,219 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Running Llama2 on Mac\n",
-    "This notebook goes over how you can set up and run Llama2 locally on a Mac using llama-cpp-python and the llama-cpp's quantized Llama2 model. It also goes over how to use LangChain to ask Llama general questions"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Steps at a glance:\n",
-    "1. Use CMAKE and install required packages\n",
-    "2. Request download of model weights from the Llama website\n",
-    "3. Clone the llama repo and get the weights\n",
-    "4. Clone the llamacpp repo and quantize the model\n",
-    "5. Prepare the script\n",
-    "6. Run the example\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<br>\n",
-    "\n",
-    "#### 1. Use CMAKE and install required packages\n",
-    "\n",
-    "Type the following command:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#CMAKE_ARGS=\"-DLLAMA_METAL=on\" FORCE_CMAKE=1: sets the appropriate build configuration options for the llama-cpp-python package \n",
-    "#and enables the use of Metal in Mac and forces the use of CMake as the build system.\n",
-    "!CMAKE_ARGS=\"-DLLAMA_METAL=on\" FORCE_CMAKE=1 pip install llama-cpp-python\n",
-    "\n",
-    "#pip install llama-cpp-python: installs the llama-cpp-python package and its dependencies:\n",
-    "!pip install pypdf sentence-transformers chromadb langchain"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "If running without a Jupyter notebook, use the command without the `!`"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "A brief look at the installed libraries:\n",
-    "- [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) a simple Python bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) library\n",
-    "- pypdf gives us the ability to work with pdfs\n",
-    "- sentence-transformers for text embeddings\n",
-    "- chromadb gives us database capabilities \n",
-    "- langchain provides necessary RAG tools for this demo"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<br>\n",
-    "\n",
-    "#### 2. Request download of model weights from the Llama website\n",
-    "Before you can run the model locally, you will need to get the model weights. To get the model weights, visit the [Llama website](https://llama.meta.com/) and click on “download models”. \n",
-    "Fill  the required information, select the models “Llama 2 & Llama Chat” and accept the terms & conditions. You will receive a URL in your email in a short time.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<br>\n",
-    "\n",
-    "#### 3. Clone the llama repo and get the weights\n",
-    "Git clone the [Llama repo](https://github.com/facebookresearch/llama.git). Enter the URL and get 13B weights. This example demonstrates a llama2 model with 13B parameters, but the steps we follow would be similar for other llama models, as well as for other parameter models.\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<br>\n",
-    "\n",
-    "#### 4. Clone the llamacpp repo and quantize the model\n",
-    "* Git clone the [Llamacpp repo](https://github.com/ggerganov/llama.cpp). \n",
-    "* Enter the repo:\n",
-    "`cd llama.cpp`\n",
-    "* Install requirements:\n",
-    "`python3 -m pip install -r requirements.txt`\n",
-    "* Convert the weights:\n",
-    "`python convert.py <path_to_your_downloaded_llama-2-13b_model>`\n",
-    "* Run make to generate the 'quantize' method that we will use in the next step\n",
-    "`make`\n",
-    "* Quantize the weights:\n",
-    "`./quantize <path_to_your_downloaded_llama-2-13b_model>/ggml-model-f16.gguf <path_to_your_downloaded_llama-2-13b_model>/ggml-model-q4_0.gguf q4_0`"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "\n",
-    "#### 5. Prepare the script\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# mentions the instance of the Llama model that we will use\n",
-    "from langchain.llms import LlamaCpp\n",
-    "\n",
-    "# defines a chain of operations that can be performed on text input to generate the output using the LLM\n",
-    "from langchain.chains import LLMChain\n",
-    "\n",
-    "# manages callbacks that are triggered at various stages during the execution of an LLMChain\n",
-    "from langchain.callbacks.manager import CallbackManager\n",
-    "\n",
-    "# defines a callback that streams the output of the LLMChain to the console in real-time as it gets generated\n",
-    "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
-    "\n",
-    "# allows to define prompt templates that can be used to generate custom inputs for the LLM\n",
-    "from langchain.prompts import PromptTemplate\n",
-    "\n",
-    "\n",
-    "# Initialize the langchain CallBackManager. This handles callbacks from Langchain and for this example we will use \n",
-    "# for token-wise streaming so you'll see the answer gets generated token by token when Llama is answering your question\n",
-    "callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])\n",
-    "\n",
-    "# Set up the model\n",
-    "llm = LlamaCpp(\n",
-    "    model_path=\"<path-to-llama-gguf-file>\",\n",
-    "    temperature=0.0,\n",
-    "    top_p=1,\n",
-    "    n_ctx=6000,\n",
-    "    callback_manager=callback_manager, \n",
-    "    verbose=True,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### 6. Run the example\n",
-    "\n",
-    "With the model set up, you are now ready to ask some questions. \n",
-    "\n",
-    "Here is an example of the simplest way to ask the model some general questions."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Run the example\n",
-    "question = \"who wrote the book Pride and Prejudice?\"\n",
-    "answer = llm(question)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Alternatively, you can use LangChain's `PromptTemplate` for some flexibility in your prompts and questions. For more information on LangChain's prompt template visit this [link](https://python.langchain.com/docs/modules/model_io/prompts/prompt_templates/)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "prompt = PromptTemplate.from_template(\n",
-    "    \"who wrote {book}?\"\n",
-    ")\n",
-    "chain = LLMChain(llm=llm, prompt=prompt)\n",
-    "answer = chain.run(\"A tale of two cities\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

+ 63 - 32
recipes/quickstart/Running_Llama2_Anywhere/Running_Llama_on_HF_transformers.ipynb

@@ -4,8 +4,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Running Llama2 on Google Colab using Hugging Face transformers library\n",
-    "This notebook goes over how you can set up and run Llama2 using Hugging Face transformers library\n",
+    "## Running Meta Llama 3 on Google Colab using Hugging Face transformers library\n",
+    "This notebook goes over how you can set up and run Llama 3 using Hugging Face transformers library\n",
     "<a href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/quickstart/Running_Llama2_Anywhere/Running_Llama_on_HF_transformers.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
    ]
   },
@@ -14,11 +14,11 @@
    "metadata": {},
    "source": [
     "### Steps at a glance:\n",
-    "This demo showcases how to run the example with already converted Llama 2 weights on [Hugging Face](https://huggingface.co/meta-llama). Please Note: To use the downloads on Hugging Face, you must first request a download as shown in the steps below making sure that you are using the same email address as your Hugging Face account.\n",
+    "This demo showcases how to run the example with already converted Llama 3 weights on [Hugging Face](https://huggingface.co/meta-llama). Please Note: To use the downloads on Hugging Face, you must first request a download as shown in the steps below making sure that you are using the same email address as your Hugging Face account.\n",
     "\n",
     "To use already converted weights, start here:\n",
     "1. Request download of model weights from the Llama website\n",
-    "2. Prepare the script\n",
+    "2. Login to Hugging Face from your terminal using the same email address as (1). Follow the instructions [here](https://huggingface.co/docs/huggingface_hub/en/quick-start). \n",
     "3. Run the example\n",
     "\n",
     "\n",
@@ -45,7 +45,7 @@
     "Request download of model weights from the Llama website\n",
     "Before you can run the model locally, you will need to get the model weights. To get the model weights, visit the [Llama website](https://llama.meta.com/) and click on “download models”. \n",
     "\n",
-    "Fill  the required information, select the models “Llama 2 & Llama Chat” and accept the terms & conditions. You will receive a URL in your email in a short time."
+    "Fill  the required information, select the models “Meta Llama 3” and accept the terms & conditions. You will receive a URL in your email in a short time."
    ]
   },
   {
@@ -79,7 +79,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -92,7 +92,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Then, we will set the model variable to a specific model we’d like to use. In this demo, we will use the 7b chat model `meta-llama/Llama-2-7b-chat-hf`."
+    "Then, we will set the model variable to a specific model we’d like to use. In this demo, we will use the 8b chat model `meta-llama/Meta-Llama-3-8B-Instruct`. Using Meta models from Hugging Face requires you to\n",
+    "\n",
+    "1. Accept Terms of Service for Meta Llama 3 on Meta [website](https://llama.meta.com/llama-downloads).\n",
+    "2. Use the same email address from Step (1) to login into Hugging Face.\n",
+    "\n",
+    "Follow the instructions on this Hugging Face page to login from your [terminal](https://huggingface.co/docs/huggingface_hub/en/quick-start). "
    ]
   },
   {
@@ -101,7 +106,26 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model = \"meta-llama/Llama-2-7b-chat-hf\"\n",
+    "pip install --upgrade huggingface_hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import login\n",
+    "login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n",
     "tokenizer = AutoTokenizer.from_pretrained(model)"
    ]
   },
@@ -174,7 +198,7 @@
     "Request download of model weights from the Llama website\n",
     "Before you can run the model locally, you will need to get the model weights. To get the model weights, visit the [Llama website](https://llama.meta.com/) and click on “download models”. \n",
     "\n",
-    "Fill  the required information, select the models “Llama 2 & Llama Chat” and accept the terms & conditions. You will receive a URL in your email in a short time.\n"
+    "Fill  the required information, select the models \"Meta Llama 3\" and accept the terms & conditions. You will receive a URL in your email in a short time."
    ]
   },
   {
@@ -182,25 +206,24 @@
    "metadata": {},
    "source": [
     "#### 2. Clone the llama repo and get the weights\n",
-    "Git clone the [Llama repo](https://github.com/facebookresearch/llama.git). Enter the URL and get 7B-chat weights. This will download the tokenizer.model, and a directory llama-2-7b-chat with the weights in it.\n",
+    "Git clone the [Meta Llama 3 repo](https://github.com/meta-llama/llama3). Run the `download.sh` script and follow the instructions. This will download the model checkpoints and tokenizer.\n",
     "\n",
-    "This example demonstrates a llama2 model with 7B-chat parameters, but the steps we follow would be similar for other llama models, as well as for other parameter models.\n",
-    "\n"
+    "This example demonstrates a Meta Llama 3 model with 8B-instruct parameters, but the steps we follow would be similar for other llama models, as well as for other parameter models."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### 3. Convert the model weights\n",
+    "#### 3. Convert the model weights using Hugging Face transformer from source\n",
     "\n",
-    "* Create a link to the tokenizer:\n",
-    "Run `ln -h ./tokenizer.model ./llama-2-7b-chat/tokenizer.model`  \n",
-    "\n",
-    "\n",
-    "* Convert the model weights to run with Hugging Face:``TRANSFORM=`python -c \"import transformers;print('/'.join(transformers.__file__.split('/')[:-1])+'/models/llama/convert_llama_weights_to_hf.py')\"``\n",
-    "\n",
-    "* Then run: `pip install protobuf && python $TRANSFORM --input_dir ./llama-2-7b-chat --model_size 7B --output_dir ./llama-2-7b-chat-hf`\n"
+    "* `python3 -m venv hf-convertor`\n",
+    "* `source hf-convertor/bin/activate`\n",
+    "* `git clone https://github.com/huggingface/transformers.git`\n",
+    "* `cd transformers`\n",
+    "* `pip install -e .`\n",
+    "* `pip install torch tiktoken blobfile accelerate`\n",
+    "* `python3 src/transformers/models/llama/convert_llama_weights_to_hf.py --input_dir ${path_to_meta_downloaded_model} --output_dir ${path_to_save_converted_hf_model} --model_size 8B --llama_version 3`"
    ]
   },
   {
@@ -210,10 +233,9 @@
     "\n",
     "#### 4. Prepare the script\n",
     "Import the following necessary modules in your script: \n",
-    "* `LlamaForCausalLM` is the Llama 2 model class\n",
-    "* `LlamaTokenizer` prepares your prompt for the model to process\n",
-    "* `pipeline` is an abstraction to generate model outputs\n",
-    "* `torch` allows us to use PyTorch and specify the datatype we’d like to use."
+    "* `AutoModel` is the Llama 2 model class\n",
+    "* `AutoTokenizer` prepares your prompt for the model to process\n",
+    "* `pipeline` is an abstraction to generate model outputs"
    ]
   },
   {
@@ -224,13 +246,14 @@
    "source": [
     "import torch\n",
     "import transformers\n",
-    "from transformers import LlamaForCausalLM, LlamaTokenizer\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
     "\n",
-    "\n",
-    "model_dir = \"./llama-2-7b-chat-hf\"\n",
-    "model = LlamaForCausalLM.from_pretrained(model_dir)\n",
-    "\n",
-    "tokenizer = LlamaTokenizer.from_pretrained(model_dir)\n"
+    "model_dir = \"${path_the_converted_hf_model}\"\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "        model_dir,\n",
+    "        device_map=\"auto\",\n",
+    "    )\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_dir)\n"
    ]
   },
   {
@@ -242,7 +265,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -296,8 +319,16 @@
    "name": "python3"
   },
   "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
    "name": "python",
-   "version": "3.8.3"
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
   }
  },
  "nbformat": 4,

+ 166 - 0
recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_Mac_Windows_Linux.ipynb

@@ -0,0 +1,166 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Running Llama 3 on Mac, Windows or Linux\n",
+    "This notebook goes over how you can set up and run Llama 3 locally on a Mac, Windows or Linux using [Ollama](https://ollama.com/)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Steps at a glance:\n",
+    "1. Download and install Ollama.\n",
+    "2. Download and test run Llama 3.\n",
+    "3. Use local Llama 3 via Python.\n",
+    "4. Use local Llama 3 via LangChain.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1. Download and install Ollama\n",
+    "\n",
+    "On Mac or Windows, go to the Ollama download page [here](https://ollama.com/download) and select your platform to download it, then double click the downloaded file to install Ollama.\n",
+    "\n",
+    "On Linux, you can simply run on a terminal `curl -fsSL https://ollama.com/install.sh | sh` to download and install Ollama."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2. Download and test run Llama 3\n",
+    "\n",
+    "On a terminal or console, run `ollama pull llama3` to download the Llama 3 8b chat model, in the 4-bit quantized format with size about 4.7 GB.\n",
+    "\n",
+    "Run `ollama pull llama3:70b` to download the Llama 3 70b chat model, also in the 4-bit quantized format with size 39GB.\n",
+    "\n",
+    "Then you can run `ollama run llama3` and ask Llama 3 questions such as \"who wrote the book godfather?\" or \"who wrote the book godfather? answer in one sentence.\" You can also try `ollama run llama3:70b`, but the inference speed will most likely be too slow - for example, on an Apple M1 Pro with 32GB RAM, it takes over 10 seconds to generate one token using Llama 3 70b chat (vs over 10 tokens per second with Llama 3 8b chat).\n",
+    "\n",
+    "You can also run the following command to test Llama 3 8b chat:\n",
+    "```\n",
+    " curl http://localhost:11434/api/chat -d '{\n",
+    "  \"model\": \"llama3\",\n",
+    "  \"messages\": [\n",
+    "    {\n",
+    "      \"role\": \"user\",\n",
+    "      \"content\": \"who wrote the book godfather?\"\n",
+    "    }\n",
+    "  ],\n",
+    "  \"stream\": false\n",
+    "}'\n",
+    "```\n",
+    "\n",
+    "The complete Ollama API doc is [here](https://github.com/ollama/ollama/blob/main/docs/api.md)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 3. Use local Llama 3 via Python\n",
+    "\n",
+    "The Python code below is the port of the curl command above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import json\n",
+    "\n",
+    "url = \"http://localhost:11434/api/chat\"\n",
+    "\n",
+    "def llama3(prompt):\n",
+    "    data = {\n",
+    "        \"model\": \"llama3\",\n",
+    "        \"messages\": [\n",
+    "            {\n",
+    "              \"role\": \"user\",\n",
+    "              \"content\": prompt\n",
+    "            }\n",
+    "        ],\n",
+    "        \"stream\": False\n",
+    "    }\n",
+    "    \n",
+    "    headers = {\n",
+    "        'Content-Type': 'application/json'\n",
+    "    }\n",
+    "    \n",
+    "    response = requests.post(url, headers=headers, json=data)\n",
+    "    \n",
+    "    return(response.json()['message']['content'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = llama3(\"who wrote the book godfather\")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 4. Use local Llama 3 via LangChain\n",
+    "\n",
+    "Code below use LangChain with Ollama to query Llama 3 running locally. For a more advanced example of using local Llama 3 with LangChain and agent-powered RAG, see [this](https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_rag_agent_llama3_local.ipynb)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install langchain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.chat_models import ChatOllama\n",
+    "\n",
+    "llm = ChatOllama(model=\"llama3\", temperature=0)\n",
+    "response = llm.invoke(\"who wrote the book godfather?\")\n",
+    "print(response.content)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
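
The new notebook sets `"stream": false` for simplicity; as a hedged sketch (not part of the notebook), the same `/api/chat` endpoint can also be consumed in streaming mode, where Ollama returns one JSON object per line:

```python
import json
import requests

# Sketch only: print tokens from the local Ollama server as they are generated.
with requests.post(
    "http://localhost:11434/api/chat",
    json={
        "model": "llama3",
        "messages": [{"role": "user", "content": "who wrote the book godfather?"}],
        "stream": True,
    },
    stream=True,
) as response:
    for line in response.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)
        if not chunk.get("done"):
            print(chunk["message"]["content"], end="", flush=True)
```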

File diff suppressed because it is too large
+ 47 - 135
recipes/use_cases/LiveData.ipynb


File diff suppressed because it is too large
+ 64 - 98
recipes/use_cases/RAG/HelloLlamaCloud.ipynb


File diff suppressed because it is too large
+ 0 - 347
recipes/use_cases/RAG/HelloLlamaLocal.ipynb


BIN
recipes/use_cases/RAG/llama2.pdf


+ 11 - 11
recipes/use_cases/README.md

@@ -1,17 +1,17 @@
-## VideoSummary: Ask Llama2 to Summarize a YouTube Video (using [Replicate](VideoSummary.ipynb) or [OctoAI](../llama_api_providers/OctoAI_API_examples/VideoSummary.ipynb))
-This demo app uses Llama2 to return a text summary of a YouTube video. It shows how to retrieve the caption of a YouTube video and how to ask Llama to summarize the content in four different ways, from the simplest naive way that works for short text to more advanced methods of using LangChain's map_reduce and refine to overcome the 4096 limit of Llama's max input token size.
+## [VideoSummary](VideoSummary.ipynb): Ask Llama 3 to Summarize a Long YouTube Video (using Replicate or [OctoAI](../llama_api_providers/OctoAI_API_examples/VideoSummary.ipynb))
+This demo app uses Llama 3 to return a text summary of a YouTube video. It shows how to retrieve the caption of a YouTube video and how to ask Llama to summarize the content in different ways, from the simplest naive way that works for short text to more advanced methods of using LangChain's map_reduce and refine to overcome the 8K context length limit of Llama 3.
 
-## [NBA2023-24](./text2sql/StructuredLlama.ipynb): Ask Llama2 about Structured Data
-This demo app shows how to use LangChain and Llama2 to let users ask questions about **structured** data stored in a SQL DB. As the 2023-24 NBA season is around the corner, we use the NBA roster info saved in a SQLite DB to show you how to ask Llama2 questions about your favorite teams or players.
+## [NBA2023-24](./text2sql/StructuredLlama.ipynb): Ask Llama 3 about Structured Data
+This demo app shows how to use LangChain and Llama 3 to let users ask questions about **structured** data stored in a SQL DB. As the 2023-24 NBA season is entering the playoffs, we use the NBA roster info saved in a SQLite DB to show you how to ask Llama 3 questions about your favorite teams or players.
 
-## LiveData: Ask Llama2 about Live Data (using [Replicate](LiveData.ipynb) or [OctoAI](../llama_api_providers/OctoAI_API_examples/LiveData.ipynb))
-This demo app shows how to perform live data augmented generation tasks with Llama2 and [LlamaIndex](https://github.com/run-llama/llama_index), another leading open-source framework for building LLM apps: it uses the [You.com search API](https://documentation.you.com/quickstart) to get live search result and ask Llama2 about them.
+## [LiveData](LiveData.ipynb): Ask Llama 3 about Live Data (using Replicate or [OctoAI](../llama_api_providers/OctoAI_API_examples/LiveData.ipynb))
+This demo app shows how to perform live data augmented generation tasks with Llama 3, [LlamaIndex](https://github.com/run-llama/llama_index), another leading open-source framework for building LLM apps, and the [Tavily](https://tavily.com) live search API.
 
-## [WhatsApp Chatbot](./chatbots/whatsapp_llama/whatsapp_llama2.md): Building a Llama-enabled WhatsApp Chatbot
-This step-by-step tutorial shows how to use the [WhatsApp Business API](https://developers.facebook.com/docs/whatsapp/cloud-api/overview) to build a Llama-enabled WhatsApp chatbot.
+## [WhatsApp Chatbot](./chatbots/whatsapp_llama/whatsapp_llama3.md): Building a Llama 3 Enabled WhatsApp Chatbot
+This step-by-step tutorial shows how to use the [WhatsApp Business API](https://developers.facebook.com/docs/whatsapp/cloud-api/overview) to build a Llama 3 enabled WhatsApp chatbot.
 
-## [Messenger Chatbot](./chatbots/messenger_llama/messenger_llama2.md): Building a Llama-enabled Messenger Chatbot
-This step-by-step tutorial shows how to use the [Messenger Platform](https://developers.facebook.com/docs/messenger-platform/overview) to build a Llama-enabled Messenger chatbot.
+## [Messenger Chatbot](./chatbots/messenger_llama/messenger_llama3.md): Building a Llama 3 Enabled Messenger Chatbot
+This step-by-step tutorial shows how to use the [Messenger Platform](https://developers.facebook.com/docs/messenger-platform/overview) to build a Llama 3 enabled Messenger chatbot.
 
 ### RAG Chatbot Example (running [locally](./chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) or on [OctoAI](../llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/RAG_Chatbot_Example.ipynb))
-A complete example of how to build a Llama 2 chatbot hosted on your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama2 locally if you have a good enough GPU or on OctoAI if you follow the note [here](../README.md#octoai_note).
+A complete example of how to build a Llama 3 chatbot hosted on your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama 3 locally if you have a good enough GPU or on OctoAI if you follow the note [here](../README.md#octoai_note).
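
To illustrate the map_reduce approach named in the VideoSummary entry above, a hedged sketch of summarizing a long transcript in chunks (the model id, splitter settings, and the `long_transcript` variable are assumptions, not taken from the notebook):

```python
from langchain.llms import Replicate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain

llm = Replicate(
    model="meta/meta-llama-3-8b-instruct",
    model_kwargs={"temperature": 0.0, "top_p": 1, "max_new_tokens": 500},
)

# Split the transcript into chunks that fit comfortably within the context window.
splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)
docs = splitter.create_documents([long_transcript])  # `long_transcript` holds the caption text

# "map": summarize each chunk separately; "reduce": combine the partial summaries into one.
chain = load_summarize_chain(llm, chain_type="map_reduce")
print(chain.run(docs))
```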

File diff suppressed because it is too large
+ 111 - 458
recipes/use_cases/VideoSummary.ipynb


+ 4 - 4
recipes/use_cases/chatbots/messenger_llama/llama_messenger.py

@@ -1,5 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
 
 import langchain
 from langchain.llms import Replicate
@@ -11,11 +11,11 @@ import requests
 import json
 
 os.environ["REPLICATE_API_TOKEN"] = "<your replicate api token>"
-llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
+llama3_8b_chat = "meta/meta-llama-3-8b-instruct"
 
 llm = Replicate(
-    model=llama2_13b_chat,
-    model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens":500}
+    model=llama3_8b_chat,
+    model_kwargs={"temperature": 0.0, "top_p": 1, "max_new_tokens":500}
 )
 
 app = Flask(__name__)

File diff suppressed because it is too large
+ 16 - 16
recipes/use_cases/chatbots/messenger_llama/messenger_llama2.md


+ 6 - 7
recipes/use_cases/chatbots/whatsapp_llama/llama_chatbot.py

@@ -1,5 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
 
 import langchain
 from langchain.llms import Replicate
@@ -39,26 +39,25 @@ class WhatsAppClient:
         return response.status_code
 
 os.environ["REPLICATE_API_TOKEN"] = "<your replicate api token>"    
-llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
+llama3_8b_chat = "meta/meta-llama-3-8b-instruct"
 
 llm = Replicate(
-    model=llama2_13b_chat,
-    model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens":500}
+    model=llama3_8b_chat,
+    model_kwargs={"temperature": 0.0, "top_p": 1, "max_new_tokens":500}
 )
 client = WhatsAppClient()
 app = Flask(__name__)
 
 @app.route("/")
 def hello_llama():
-    return "<p>Hello Llama 2</p>"
+    return "<p>Hello Llama 3</p>"
 
 @app.route('/msgrcvd', methods=['POST', 'GET'])
 def msgrcvd():    
     message = request.args.get('message')
-    #client.send_template_message("hello_world", "en_US", "14086745477")
     answer = llm(message)
     print(message)
     print(answer)
-    client.send_text_message(llm(message), "14086745477")
+    client.send_text_message(llm(message), "<your phone number>")
     return message + "<p/>" + answer
 

File diff suppressed because it is too large
+ 18 - 18
recipes/use_cases/chatbots/whatsapp_llama/whatsapp_llama2.md


File diff suppressed because it is too large
+ 86 - 401
recipes/use_cases/text2sql/StructuredLlama.ipynb


BIN
recipes/use_cases/text2sql/nba_roster.db


+ 2 - 0
scripts/spellcheck_conf/wordlist.txt

@@ -1308,3 +1308,5 @@ fmbench
 ipykernel
 leaderboards
 txn
+ollama
+tavily