add airllm

This commit is contained in:
Yu Li
2023-11-17 15:12:39 -06:00
parent 6770bdcf9a
commit 662d1d6e35
7 changed files with 701 additions and 0 deletions

4
.gitignore vendored

@@ -1,3 +1,7 @@
.idea
.ipynb_checkpoints
.DS_Store
airllm.egg-info
build
dist
__pycache__

201
air_llm/LICENSE Normal file

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

66
air_llm/README.md Normal file

@@ -0,0 +1,66 @@
AirLLM optimizes inference memory usage, allowing 70B large language models to run inference on a single 4GB GPU card. No quantization, distillation, pruning, or other model compression techniques that would degrade model performance are needed.
## Quickstart
### Install the package
First, install the airllm pip package:
```bash
pip install airllm
```
If the package cannot be found, it may be because of the default pip mirror. Try specifying the original PyPI index:
```bash
pip install -i https://pypi.org/simple/ airllm
```
### Inference
Then, initialize AirLLMLlama2 with the Hugging Face repo ID of the model (or its local path) and run inference much like a regular transformers model:
```python
from airllm import AirLLMLlama2
MAX_LENGTH = 128
# could use hugging face model repo id:
model = AirLLMLlama2("garage-bAInd/Platypus2-70B-instruct")
# or use model's local path...
#model = AirLLMLlama2("/home/ubuntu/.cache/huggingface/hub/models--garage-bAInd--Platypus2-70B-instruct/snapshots/b585e74bcaae02e52665d9ac6d23f4d0dbc81a0f")
input_text = [
'What is the capital of United States?',
#'I like',
]
input_tokens = model.tokenizer(input_text,
return_tensors="pt",
return_attention_mask=False,
truncation=True,
max_length=MAX_LENGTH,
padding=True)
generation_output = model.generate(
input_tokens['input_ids'].cuda(),
max_new_tokens=2,
use_cache=True,
return_dict_in_generate=True)
output = model.tokenizer.decode(generation_output.sequences[0])
print(output)
```
Note: during inference, the original model is first decomposed and saved layer by layer. Please make sure there is enough disk space in the Hugging Face cache directory.
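The split can also be done ahead of time with `split_and_save_layers`, which the package exports. Below is a minimal sketch assuming a locally downloaded sharded checkpoint; the checkpoint directory shown is a placeholder you need to replace, and the ~130GB figure is a rough estimate for a 70B fp16 model.
```python
import shutil
from airllm import split_and_save_layers

# Placeholder path: point this at your locally downloaded checkpoint directory
# (it must contain pytorch_model.bin.index.json and the .bin shards).
checkpoint_dir = "/path/to/models--garage-bAInd--Platypus2-70B-instruct/snapshots/<snapshot>"

# Rough estimate: a 70B fp16 checkpoint needs about 130GB of free space for the per-layer copies.
total, used, free = shutil.disk_usage(checkpoint_dir)
print(f"free disk space: {free / 1024**3:.1f} GB")

splitted_path = split_and_save_layers(checkpoint_dir)
print(f"layer-wise safetensors written to: {splitted_path}")
```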


@@ -0,0 +1,2 @@
from .airllm import AirLLMLlama2
from .airllm import split_and_save_layers

368
air_llm/airllm/airllm.py Normal file

@@ -0,0 +1,368 @@
import gc
import json
import os
from typing import List, Optional, Tuple, Union
import ctypes
import shutil
from tqdm import tqdm
from pathlib import Path
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel, GenerationMixin, LlamaForCausalLM, GenerationConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
from accelerate import init_empty_weights
from accelerate.utils.modeling import set_module_tensor_to_device
from safetensors.torch import load_file, save_file
from optimum.bettertransformer import BetterTransformer
import huggingface_hub
# Function to clean RAM & vRAM
def clean_memory():
gc.collect()
ctypes.CDLL("libc.so.6").malloc_trim(0)
torch.cuda.empty_cache()
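# Load a single layer's weights from its safetensors file into CPU memory.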
def load_layer(local_path, layer_name):
layer_state_dict = load_file(Path(local_path) / (layer_name + ".safetensors"), device="cpu")
return layer_state_dict
def split_and_save_layers(checkpoint_path, splitted_model_dir_name='splitted_model'):
"""
Save all layers of a model's sharded checkpoint as per-layer safetensors files.
"""
checkpoint_path = Path(checkpoint_path)
total, used, free = shutil.disk_usage(checkpoint_path)
Llama2_70B_size = 134720680
if free/1024 < Llama2_70B_size:
print(f"WARNING: free space in the saving path {checkpoint_path / splitted_model_dir_name} seems small: {free/1024/1024/1024:02f}GB, please make sure you have enough space to save the splitted model")
with open(checkpoint_path / 'pytorch_model.bin.index.json', 'rb') as f:
index = json.load(f)['weight_map']
n_layers = len(set([int(k.split('.')[2]) for k in index.keys() if 'model.layers' in k]))
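# Weight-name prefixes in forward order: token embeddings, each transformer block, the final norm, and the LM head.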
layers = ['model.embed_tokens.'] + [f'model.layers.{i}.' for i in range(n_layers)] + ['model.norm.', 'lm_head.']
shard = 0
n_shards = len(set(index.values()))
state_dict = {}
if not os.path.exists(checkpoint_path / splitted_model_dir_name):
os.makedirs(checkpoint_path / splitted_model_dir_name)
for layer in tqdm(layers):
# Optionally load the next shard
shards = [int(v.split('-')[1]) for k, v in index.items() if k.startswith(layer)]
if max(shards) > shard:
shard += 1
print(f'Loading shard {shard}/{n_shards}')
state_dict.update(torch.load(checkpoint_path / f'pytorch_model-000{shard:02d}-of-000{n_shards:02d}.bin',
map_location='cpu'))
# Get layer state dict
layer_state_dict = dict([(k, v) for k, v in state_dict.items() if k.startswith(layer)])
# Save the layer state dict using safetensors
save_file(layer_state_dict, checkpoint_path / splitted_model_dir_name / (layer + 'safetensors'))
print(f"saved as: {checkpoint_path / splitted_model_dir_name / (layer + 'safetensors')}")
# Free memory
for k in layer_state_dict.keys():
del state_dict[k]
del layer_state_dict
gc.collect()
return str(checkpoint_path / splitted_model_dir_name)
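# Resolve a Hugging Face repo ID or local path to a directory of per-layer safetensors, splitting the checkpoint on first use.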
def find_or_create_local_splitted_path(model_local_path_or_repo_id):
# try as splitted path first...
if os.path.exists(Path(model_local_path_or_repo_id) / 'splitted_model'):
return Path(model_local_path_or_repo_id) / 'splitted_model'
# try local model path
if os.path.exists(model_local_path_or_repo_id):
if os.path.exists(Path(model_local_path_or_repo_id) / 'pytorch_model.bin.index.json'):
return split_and_save_layers(model_local_path_or_repo_id)
else:
print(
f"Found local directory in {model_local_path_or_repo_id}, but didn't find downloaded model. Try using {model_local_path_or_repo_id} as a HF repo...")
# it should be a repo id at this point...
hf_cache_path = huggingface_hub.snapshot_download(model_local_path_or_repo_id)
assert os.path.exists(Path(
hf_cache_path) / 'pytorch_model.bin.index.json'), f"{hf_cache_path}/pytorch_model.bin.index.json should exist."
if os.path.exists(Path(hf_cache_path) / 'splitted_model'):
return Path(hf_cache_path) / 'splitted_model'
else:
return split_and_save_layers(hf_cache_path)
class AirLLMLlama2(GenerationMixin):
def __init__(self, model_local_path_or_repo_id, device="cuda:0", dtype=torch.float16, max_seq_len=512):
"""
Sharded version of LlamaForCausalLM: the model is split into layer shards to reduce GPU memory usage.
During the forward pass, the inputs are processed layer by layer, and the GPU memory is freed after each layer.
To avoid loading the layers multiple times, we could save all the intermediate activations in RAM, but
as Kaggle accelerators have more GPU memory than CPU, we simply batch the inputs and keep them on the GPU.
Parameters
----------
model_local_path_or_repo_id : str or Path
Hugging Face repo ID or local path to the model checkpoint
device : str, optional
device, by default "cuda:0"
dtype : torch.dtype, optional
dtype, by default torch.float16
"""
# Save parameters
self.checkpoint_path = Path(find_or_create_local_splitted_path(model_local_path_or_repo_id))
self.running_device = device
self.device = torch.device(self.running_device)
self.running_dtype = dtype
self.dtype = self.running_dtype
# Create model
self.config = AutoConfig.from_pretrained(self.checkpoint_path.parent)
self.generation_config = GenerationConfig.from_pretrained(self.checkpoint_path.parent)
#print(f"using generation_config: {self.generation_config}")
self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint_path.parent)
self.tokenizer.pad_token = self.tokenizer.eos_token
self.tokenizer.padding_side = "right"
self.init_model()
self.layer_names = ["model.embed_tokens"] + [f"model.layers.{i}" for i in
range(len(self.model.model.layers))] + ["model.norm", "lm_head"]
self.max_seq_len = max_seq_len
self.main_input_name = "input_ids"
def init_model(self):
# Load meta model (no memory used)
with init_empty_weights():
self.model = AutoModelForCausalLM.from_config(self.config)
self.model.eval()
self.model = BetterTransformer.transform(self.model) # enable flash attention
self.model.tie_weights()
self.layers = [self.model.model.embed_tokens] + list(self.model.model.layers) + [self.model.model.norm,
self.model.lm_head]
# Move buffers to device (not that much GPU memory used)
for buffer_name, buffer in self.model.named_buffers():
set_module_tensor_to_device(self.model, buffer_name, self.running_device, value=buffer,
dtype=self.running_dtype)
def load_layer_to_cpu(self, layer_name):
state_dict = load_layer(self.checkpoint_path, layer_name)
return state_dict
def move_layer_to_device(self, state_dict):
for param_name, param in state_dict.items():
assert param.dtype != torch.int8, "int8 not supported (need to add fp16_statistics)"
set_module_tensor_to_device(self.model, param_name, self.running_device, value=param,
dtype=self.running_dtype)
# make GenerationMixin happy
def can_generate(self):
return True
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# Some generation methods already pass only the last input ID
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Default to old behavior: keep only final ID
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1]:]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
# Reboot the model to make sure buffers are loaded and memory is clean
del self.model
clean_memory()
self.init_model()
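# Keep every input sequence as its own [1, seq_len] tensor on the GPU; the layers are applied to them one by one below.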
batch = [input_ids_unit.to(self.running_device).unsqueeze(0) for input_ids_unit in input_ids]
n_seq = len(batch[0])
batch_eos = [(input_ids_unit != self.tokenizer.pad_token_id).sum(0) - 1 for input_ids_unit in input_ids]
# Create attention mask for the largest input, and position ids to use KV cache
attention_mask = torch.ones(self.max_seq_len, self.max_seq_len)
attention_mask = attention_mask.triu(diagonal=1)[None, None, ...] == 0
attention_mask = attention_mask.to(self.running_device)
position_ids = torch.arange(self.max_seq_len, dtype=torch.long, device=self.running_device)[None, :]
kv_cache_list = [] if use_cache else None
if use_cache:
for x in self.layers:
kv_cache_list.append(([], []))
all_hidden_states = [[] for _ in range(len(self.layers))] if output_hidden_states else None
all_self_attns = [[] for _ in range(len(self.layers))] if output_attentions else None
with torch.inference_mode():
for i, (layer_name, layer) in tqdm(enumerate(zip(self.layer_names, self.layers)), desc=self.running_device,
total=len(self.layers)):
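# Stream one layer at a time: read its weights from disk to CPU, materialize them on the GPU, run the layer, then free the memory.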
state_dict = self.load_layer_to_cpu(layer_name)
self.move_layer_to_device(state_dict)
# Run layer
for j, seq in enumerate(batch):
if layer_name == "model.embed_tokens":
batch[j] = layer(seq)
elif layer_name == "model.norm":
batch[j] = layer(seq[torch.arange(n_seq), batch_eos[j]][:, None])
if output_hidden_states:
all_hidden_states[i].append(batch[j])
elif layer_name == "lm_head":
batch[j] = layer(seq).float()
else:
if output_hidden_states:
all_hidden_states[i].append(seq)
if past_key_values is not None:
# join past kv
k_cache, v_cache = past_key_values[i - 1]
len_p = past_key_values[0][0].shape[2]
len_s = seq.shape[1]
pos = position_ids[:, len_p:len_p + len_s]
attn = attention_mask[:, :, -len_s:, -len_p - len_s:]
kv_cache = (k_cache,
v_cache,
)
layer_outputs = layer(seq,
use_cache=True,
output_attentions=output_attentions,
past_key_value=kv_cache,
position_ids=pos,
attention_mask=attn)
new_seq = layer_outputs[0]
if output_attentions:
all_self_attns[i].append(layer_outputs[1])
if use_cache:
(k_cache, v_cache) = layer_outputs[2 if output_attentions else 1]
kv_cache_list[i][0].append(k_cache)
kv_cache_list[i][1].append(v_cache)
else:
len_seq = seq.shape[1]
if not use_cache:
new_seq = layer(seq,
attention_mask=attention_mask[:, :, -len_seq:, -len_seq:])[0]
else:
new_seq, (k_cache, v_cache) = layer(seq,
use_cache=True,
attention_mask=attention_mask[:, :, -len_seq:,
-len_seq:])
kv_cache_list[i][0].append(k_cache)
kv_cache_list[i][1].append(v_cache)
# print(f"k_cache size: {k_cache.shape}")
# print(f"k_cache sizes: {[len(x[1]) for x in kv_cache_list]}")
batch[j] = new_seq
if output_hidden_states:
all_hidden_states += (torch.cat(batch, 0),)
# Remove previous layer from memory (including buffers)
layer.to("meta")
clean_memory() # proposed by CPMP
logits = torch.cat(batch, 0)
if use_cache:
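# Keep only the transformer blocks' KV caches; drop the placeholder entries for embed_tokens, model.norm and lm_head.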
kv_cache_list = kv_cache_list[1:-2]
for i in range(len(kv_cache_list)):
# print(f"{i} - {kv_cache_list[i][0].shape}")
kv_cache_list[i] = (torch.cat(kv_cache_list[i][0], 0), torch.cat(kv_cache_list[i][1], 0))
print(f"returning kvcache size: {kv_cache_list[0][0].shape}")
if output_attentions:
all_self_attns = all_self_attns[0:-2]
for i in range(len(all_self_attns)):
all_self_attns[i] = torch.cat(all_self_attns[i], 0)
if output_hidden_states:
all_hidden_states = all_hidden_states[0:-2]
for i in range(len(all_hidden_states)):
all_hidden_states[i] = torch.cat(all_hidden_states[i], 0)
if not return_dict:
return tuple(v for v in [logits,
tuple(kv_cache_list) if kv_cache_list is not None else None,
tuple(all_hidden_states) if all_hidden_states is not None else None,
tuple(all_self_attns) if all_self_attns is not None else None] if v is not None)
return CausalLMOutputWithPast(
loss=None,
logits=logits,
past_key_values=tuple(kv_cache_list) if kv_cache_list is not None else None,
hidden_states=tuple(all_hidden_states) if all_hidden_states is not None else None,
attentions=tuple(all_self_attns) if all_self_attns is not None else None,
)


@@ -0,0 +1,30 @@
from airllm import AirLLMLlama2
MAX_LENGTH = 128
# could use hugging face model repo id:
model = AirLLMLlama2("garage-bAInd/Platypus2-70B-instruct")
# or use model's local path...
#model = AirLLMLlama2("/home/ubuntu/.cache/huggingface/hub/models--garage-bAInd--Platypus2-70B-instruct/snapshots/b585e74bcaae02e52665d9ac6d23f4d0dbc81a0f")
input_text = [
'What is the capital of United States?',
#'I like',
]
input_tokens = model.tokenizer(input_text,
return_tensors="pt",
return_attention_mask=False,
truncation=True,
max_length=MAX_LENGTH,
padding=True)
generation_output = model.generate(
input_tokens['input_ids'].cuda(),
max_new_tokens=2,
use_cache=True,
return_dict_in_generate=True)
output = model.tokenizer.decode(generation_output.sequences[0])
print(output)

30
air_llm/setup.py Normal file

@@ -0,0 +1,30 @@
import setuptools
with open("README.md", "r") as fh:
long_description = fh.read()
setuptools.setup(
name="airllm",
version="0.9.3",
author="Gavin Li",
author_email="gavinli@animaai.cloud",
description="AirLLM allows a single 4GB GPU card to run 70B large language models without quantization, distillation or pruning.",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/lyogavin/Anima/tree/main/air_llm",
packages=setuptools.find_packages(),
install_requires=[
'tqdm',
'torch',
'transformers',
'accelerate',
'safetensors',
'optimum',
'huggingface_hub'
],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
)