Merge pull request #3 from GreenBitAI/feature/langchain_integration
Feature/langchain integration
yanghaojin authored Oct 10, 2024
2 parents d57572e + 973fe2d commit 9750cda
Showing 48 changed files with 65,226 additions and 21 deletions.
13 changes: 12 additions & 1 deletion CHANGELOG.md
@@ -47,4 +47,15 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
### Updated

- synchronized with the mlx-lm
- simplified README

## [0.3.3] - 2024/10/10

### Added

- langchain integration
- local_rag and graph_rag examples

### Updated

- generate method to support hidden states output
5 changes: 5 additions & 0 deletions README.md
@@ -58,8 +58,13 @@ A high-performance HTTP API for text generation with GreenBitAI's mlx models. Im
```
2. Use:
```shell
# Chat
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" \
-d '{"model": "default_model", "messages": [{"role": "user", "content": "Hello!"}]}'

# Chat stream
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" \
-d '{"model": "default_model", "messages": [{"role": "user", "content": "Hello!"}], "stream": "True"}'
```
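The streaming request above can also be consumed programmatically. Below is a minimal Python client sketch; it assumes the endpoint emits OpenAI-style SSE chunks (`data: {...}` lines ending with `data: [DONE]`), which this diff does not confirm, so treat the parsing as illustrative.

```python
# Hypothetical client for the streaming endpoint shown above. The SSE
# framing ("data: {json}" lines, terminated by "data: [DONE]") is an
# assumption; check fastapi_server.py for the actual wire format.
import json
import requests

def stream_chat(prompt: str, url: str = "http://localhost:8000/v1/chat/completions") -> None:
    payload = {
        "model": "default_model",
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
    }
    with requests.post(url, json=payload, stream=True) as resp:
        resp.raise_for_status()
        for raw in resp.iter_lines():
            if not raw:
                continue
            line = raw.decode("utf-8").removeprefix("data: ")
            if line.strip() == "[DONE]":
                break
            chunk = json.loads(line)
            # Chat streaming responses carry incremental text in "delta".
            delta = chunk["choices"][0].get("delta", {})
            print(delta.get("content", ""), end="", flush=True)

if __name__ == "__main__":
    stream_chat("Hello!")
```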

#### Features
8 changes: 4 additions & 4 deletions gbx_lm/fastapi_server.py
@@ -185,7 +185,7 @@ async def stream_completion(prompt, request, model, tokenizer):

stop_id_sequences = [tokenizer.encode(stop) for stop in (request.stop or [])]

for (token, _), _ in zip(
for (token, _, _), _ in zip(
generate_step(
prompt=prompt,
model=model,
@@ -234,7 +234,7 @@ async def stream_chat_completion(prompt, request, model, tokenizer):

stop_id_sequences = [tokenizer.encode(stop) for stop in (request.stop or [])]

for (token, _), _ in zip(
for (token, _, _), _ in zip(
generate_step(
prompt=prompt,
model=model,
@@ -282,7 +282,7 @@ def generate_completion(prompt, request, model, tokenizer):

stop_id_sequences = [tokenizer.encode(stop) for stop in (request.stop or [])]

for (token, _), _ in zip(
for (token, _, _), _ in zip(
generate_step(
prompt=prompt,
model=model,
@@ -332,7 +332,7 @@ def generate_chat_completion(prompt, request, model, tokenizer):

stop_id_sequences = [tokenizer.encode(stop) for stop in (request.stop or [])]

for (token, _), _ in zip(
for (token, _, _), _ in zip(
generate_step(
prompt=prompt,
model=model,
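All four hunks above make the same change: the loop now unpacks a 3-tuple from `generate_step` instead of a 2-tuple, which matches the changelog note that generation now supports hidden-states output. A self-contained toy sketch of the pattern follows; the stub generator stands in for `generate_step`, whose real signature is not shown in this diff, and the names of the extra elements are assumptions.

```python
# Toy stand-in for generate_step: after this PR it yields 3-tuples rather
# than 2-tuples. Calling the elements (token, logprobs, hidden_states) is
# an assumption based on the changelog entry about hidden-states output.
from typing import Iterator, List, Tuple

def fake_generate_step(n: int) -> Iterator[Tuple[int, float, List[float]]]:
    for i in range(n):
        yield i, -0.1 * i, [0.0] * 4  # token id, log-prob, hidden state

max_tokens = 3
for (token, logprobs, hidden_states), step in zip(fake_generate_step(10), range(max_tokens)):
    # Callers that only need the token keep ignoring the extra elements,
    # exactly as the server code does with `for (token, _, _), _ in zip(...)`.
    print(step, token)
```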
15 changes: 13 additions & 2 deletions gbx_lm/generate.py
@@ -136,6 +136,17 @@ def colorprint_by_t0(s, t0):


def load_kv_cache_from_file(kv_cache_file):
"""
Load key-value cache from a specified file and organize it by layer.
Args:
kv_cache_file (str or None): The path to the cache file. If None, the function returns (None, None).
Returns:
tuple: A tuple containing:
- cache_history (list): A list where each index corresponds to a layer and contains a tuple of (keys, values).
- metadata (any): Metadata returned from the cache file.
"""
if kv_cache_file is None:
return None, None
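The rest of the function body is collapsed in this diff. For orientation, here is one way a loader matching that docstring could be organized; the safetensors key convention (`"<layer>_keys"` / `"<layer>_values"`), the `num_layers` metadata field, and the use of `mx.load(..., return_metadata=True)` are assumptions, not taken from the diff.

```python
# Hypothetical sketch of a per-layer KV-cache loader matching the docstring
# above. Key names, metadata fields, and file format are assumptions.
import mlx.core as mx

def load_kv_cache_from_file_sketch(kv_cache_file):
    if kv_cache_file is None:
        return None, None

    tensors, metadata = mx.load(kv_cache_file, return_metadata=True)

    # Rebuild a list indexed by layer, each entry holding (keys, values).
    cache_history = [None] * int(metadata["num_layers"])
    for name, array in tensors.items():
        layer, kind = name.split("_")  # e.g. "3_keys" -> ("3", "keys")
        idx = int(layer)
        if cache_history[idx] is None:
            cache_history[idx] = [None, None]
        cache_history[idx][0 if kind == "keys" else 1] = array
    return cache_history, metadata
```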

@@ -161,7 +172,7 @@ def do_generate(args, model: nn.Module, tokenizer: PreTrainedTokenizer, prompt:
messages = [
{
"role": "user",
"content": sys.stdin.read() if args.prompt == "-" else args.prompt,
"content": sys.stdin.read() if prompt == "-" else prompt,
}
]
prompt = tokenizer.apply_chat_template(
Expand All @@ -178,7 +189,7 @@ def do_generate(args, model: nn.Module, tokenizer: PreTrainedTokenizer, prompt:
)
prompt = prompt[test_prompt.index("<query>") :]
else:
prompt = args.prompt
prompt = prompt

if args.colorize and not args.verbose:
raise ValueError("Cannot use --colorize with --verbose=False")
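The two small hunks above drop the remaining references to `args.prompt` inside `do_generate`, so the function now uses its `prompt` parameter directly while keeping the `"-"` convention for reading the prompt from stdin. A minimal illustration of that convention, with a hypothetical helper name:

```python
import sys

def resolve_prompt(prompt: str) -> str:
    # Hypothetical helper mirroring the pattern in do_generate: a literal "-"
    # means the prompt text is read from standard input, which keeps the CLI
    # pipe-friendly; anything else is used verbatim.
    return sys.stdin.read() if prompt == "-" else prompt
```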
61 changes: 61 additions & 0 deletions gbx_lm/langchain/README.md
@@ -0,0 +1,61 @@
# GBX Langchain Demos

## Overview

GBX Langchain Demos showcase the integration of GBX language models with the Langchain framework, enabling powerful and flexible natural language processing capabilities.

## Installation

### Step 1: Install the gbx_lm Package

```bash
pip install gbx-lm
```

### Step 2: Install Langchain Package

```bash
pip install langchain-core
```

Ensure your system has Python 3 and pip installed before proceeding.

## Usage

### Basic Example

Here's a basic example of how to use the GBX Langchain integration:

```python
from gbx_lm.langchain import ChatGBX
from gbx_lm.langchain import GBXPipeline

llm = GBXPipeline.from_model_id(
model_id="GreenBitAI/Llama-3-8B-layer-mix-bpw-4.0-mlx",
)
chat = ChatGBX(llm=llm)
```
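Once constructed, the chat model can be called like any other LangChain chat model. The follow-up below assumes `ChatGBX` implements LangChain's standard chat-model (Runnable) interface, which this README does not spell out; the prompt text is illustrative.

```python
from gbx_lm.langchain import ChatGBX, GBXPipeline
from langchain_core.messages import HumanMessage

llm = GBXPipeline.from_model_id(
    model_id="GreenBitAI/Llama-3-8B-layer-mix-bpw-4.0-mlx",
)
chat = ChatGBX(llm=llm)

# Assumes ChatGBX follows LangChain's BaseChatModel/Runnable interface,
# so invoke() accepts a list of messages and returns a message object.
response = chat.invoke([HumanMessage(content="Explain 4-bit quantization in one sentence.")])
print(response.content)
```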

### Advanced Usage

#### Using `from_model_id` with custom parameters:

```python
from gbx_lm.langchain import GBXPipeline

pipe = GBXPipeline.from_model_id(
model_id="GreenBitAI/Llama-3-8B-layer-mix-bpw-4.0-mlx",
pipeline_kwargs={"max_tokens": 100, "temp": 0.7},
)
```

#### Passing model and tokenizer directly:

```python
from gbx_lm.langchain import GBXPipeline
from gbx_lm import load

model_id = "GreenBitAI/Llama-3-8B-layer-mix-bpw-4.0-mlx"
model, tokenizer = load(model_id)
pipe = GBXPipeline(model=model, tokenizer=tokenizer)
```
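
#### Composing into a chain (illustrative):

These pieces compose with the LangChain expression language. The sketch below assumes both `GBXPipeline` and `ChatGBX` conform to the Runnable interface; the prompt template text and parameters are illustrative.

```python
from gbx_lm.langchain import ChatGBX, GBXPipeline
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

llm = GBXPipeline.from_model_id(
    model_id="GreenBitAI/Llama-3-8B-layer-mix-bpw-4.0-mlx",
    pipeline_kwargs={"max_tokens": 128, "temp": 0.7},
)
chat = ChatGBX(llm=llm)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a concise assistant."),
        ("human", "{question}"),
    ]
)

# Assumes ChatGBX is a Runnable, so it can be piped with the | operator.
chain = prompt | chat | StrOutputParser()
print(chain.invoke({"question": "What does layer-mix quantization mean?"}))
```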
3 changes: 3 additions & 0 deletions gbx_lm/langchain/__init__.py
@@ -0,0 +1,3 @@
from .gbx_pipeline import GBXPipeline
from .chat_gbx import ChatGBX
from .graph_transformer import SimpleGraphTransformer