[ENH] Restructure the project.

This commit is contained in:
herobrine19
2023-05-22 02:39:04 +08:00
parent f329433cad
commit 46a9cbc21e
22 changed files with 755 additions and 36 deletions

View File

@@ -1,8 +0,0 @@
[
{
"content": "中华人民共和国最高人民法院 再 审 决 定 书2022最高法刑申136号 原审被告人张某某犯挪用资金罪和伪造、变造国家机关公文罪一案山西省运城市盐湖区人民法院于2012年5月2日以2012运盐刑初字第69号刑事判决认定张克云犯贪污罪判处有期徒刑十二年犯伪造、变造国家机关公文罪判处有期徒刑三年决定执行有期徒刑十三年。宣判后张克云不服提出上诉。山西省运城市中级人民法院于2012年11月12日以2012运中刑二终字第125号刑事裁定驳回上诉维持原判。裁判生效后张克云不服提出申诉。运城市中级人民法院于2013年1月7日以2013运中刑申字第3号驳回申诉通知驳回其申诉。山西省高级人民法院于2017年7月13日以2013晋刑监字第8号再审决定提审本案并于2019年12月24日以2017晋刑再第2号刑事判决认定张克云犯挪用资金罪判处有期徒刑七年六个月与原判伪造、变造国家机关公文罪被判处的有期徒刑三年数罪并罚决定执行有期徒刑十年。张克云仍不服以原审认定事实错误其作为学校董事长、全资投资人有权决定学校相关款项用途学校仍欠其债务个人账户用于学校经费开支没有挪用资金的动机和行为不构成挪用资金罪等为由向本院提出申诉。本院经审查认为原审生效裁判对挪用资金罪定罪量刑的证据不确实、不充分依法应当予以排除。依照《中华人民共和国刑事诉讼法》第二百五十三条第二项、第二百五十四条第二款、第二百五十五条的规定决定如下指令河南省高级人民法院对本案进行再审。二二二年十二月二十九日"
},
{
"content":"中华人民共和国最高人民法院 驳 回 申 诉 通 知 书2022最高法刑申122号 袁某银、袁某财你们因原审被告人袁德银故意伤害一案对江苏省南京市溧水区人民法院2014溧刑初字第268号刑事判决、南京市中级人民法院2015宁刑终字第433号刑事裁定不服以被害人朱宽荣住院期间的136678号报告并未显示其左侧4、5、6、7、8肋骨骨折出院记录及137470号、143006号报告均系伪造江苏省高级人民法院2019苏刑申172号驳回申诉通知书对137470号报告的形成时间认定错误为由向本院提出申诉请求撤销原判依法重新审理本案。本院依法组成合议庭认真审查后认为原审认定原审被告人袁德银因邻里纠纷殴打被害人朱宽荣致其左胸多发肋骨骨折构成轻伤二级其行为构成故意伤害罪并无不当。关于你们提出的原审认定被害人朱宽荣轻伤二级的证据系伪造的申诉理由。首先根据你们提供的136678号报告朱宽荣于2015年2月12日入院时经检查被诊断为左侧多发肋骨骨折该份报告虽未明确骨折的具体位置但与出院记录记载的“左侧多发肋骨骨折4、5、6、7、8”并不矛盾。根据《人体损伤程度鉴定标准》的规定肋骨骨折2处以上即构成轻伤二级故鉴定意见认定朱宽荣的损伤构成轻伤二级并无不当。其次你们主张出院记录、137470号和143006号报告系伪造但并未提供证据证实。即使排除这两份报告也不影响对本案事实的认定。再次我院对申诉进行复查的对象是已经发生法律效力的判决、裁定本案的生效裁判是2014溧刑初字第268号刑事判决和2015宁刑终字第433号刑事裁定江苏省高级人民法院作出的驳回申诉通知书不属于我院复查的范围。综上本院认为你们的申诉不符合《中华人民共和国刑事诉讼法》第二百五十三条规定的应当重新审判的情形予以驳回。望你们尊重人民法院的生效裁判自觉服判息诉。特此通知。二二二年十二月十四日"
}
]
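
Each record in this file carries a single "content" field holding the raw text of one court document. A minimal loading sketch, assuming only that the file is plain JSON as shown (the file name is a placeholder, since the diff does not give it):

# Sketch: read a list of {"content": ...} records and keep just the raw text.
import json

with open("legal_documents.json", encoding="utf-8") as fp:  # placeholder file name
    records = json.load(fp)
texts = [record["content"] for record in records]
print(len(texts), texts[0][:50])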

View File

@@ -1,10 +0,0 @@
[
{
"instruction": "酒驾撞人要判多久?",
"input": "",
"output": "《刑法》第一百三十三条规定:违反交通运输管理法规,因而发生重大事故,致人重伤、死广或者使公私财产遭受重大损失的,处三年以下有期徒刑或者拘役;交通运输肇事后逃逸,致人重伤的,处三年以上七年以下有期徒刑。交通肇事后逃逸致人死亡的,处七年以上有期徒刑。"
},
{"instruction":"偷狗被派出所抓到,会怎么处理?",
"input":"",
"output":"取决于偷盗的狗的价值,可能按盗窃罪的罪名处理。如果价值达到犯罪标准,就构成盗窃罪,要承担刑事责任;如果不到犯罪标准,就是治安处罚、罚款或者拘留治安处罚这会涉嫌构成盗窃。如果不到一千元,则不会构成犯罪。如果超过一千元,则可能会是构成犯罪的。"}
]
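
These records follow the Alpaca-style instruction/input/output layout that the training scripts below consume. A hedged sketch of loading such a file with Hugging Face datasets, the way alpaca-lora derived scripts usually do (the file name is a placeholder, and whether finetune.py uses exactly this call is an assumption):

# Sketch: load instruction/input/output records as a Hugging Face dataset.
from datasets import load_dataset

data = load_dataset("json", data_files="legal_instructions.json")  # placeholder file name
sample = data["train"][0]
print(sample["instruction"], sample["output"][:30])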

View File

View File

@@ -1,17 +0,0 @@
#!/bin/bash
WORLD_SIZE=8 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 --master_port=1234 finetune.py \
--base_model 'minlik/chinese-llama-7b-merged' \
--data_path '' \
--output_dir './outputs/LawGPT' \
--prompt_template_name 'law_template' \
--micro_batch_size 16 \
--batch_size 128 \
--num_epochs 3 \
--val_set_size 10000 \
--lora_target_modules='[q_proj,k_proj,v_proj,o_proj]' \
--lora_r 16 \
--lora_alpha 32 \
--learning_rate 3e-4 \
--cutoff_len 512 \
--resume_from_checkpoint './outputs/LawGPT'
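
Assuming the usual alpaca-lora convention that finetune.py derives gradient accumulation from these flags, the effective optimizer batch stays at --batch_size: 128 / (16 micro-batch × 8 processes) = 1 accumulation step, so every optimizer update sees 128 examples, while --val_set_size 10000 holds out 10000 examples for evaluation.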

View File

@@ -1,7 +0,0 @@
CUDA_VISIBLE_DEVICES=1 python generate.py \
--load_8bit \
--base_model 'minlik/chinese-llama-7b-merged' \
--lora_weights 'entity303/lawgpt-lora-7b' \
--prompt_template 'law_template' \
--share_gradio
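
Assuming this generate.py follows the standard alpaca-lora generation script, --load_8bit loads the base model in 8-bit via bitsandbytes to reduce memory, --lora_weights pulls the published adapter entity303/lawgpt-lora-7b from the Hugging Face Hub, and --share_gradio requests a public Gradio share link for the demo.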

View File

@@ -1,20 +0,0 @@
#!/bin/bash
WORLD_SIZE=8 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 --master_port=1235 train_lora.py \
--base_model '../models/base_models/chinese_llama_7b' \
--data_path '' \
--output_dir '../models/lora_weights' \
--batch_size 128 \
--micro_batch_size 8 \
--num_epochs 1 \
--learning_rate 0.0003 \
--cutoff_len 1024 \
--val_set_size 0 \
--lora_r 16 \
--lora_alpha 32 \
--lora_dropout 0.05 \
--lora_target_modules '[q_proj, v_proj, k_proj, o_proj]' \
--train_on_inputs True \
--add_eos_token True \
--group_by_length True \
--resume_from_checkpoint '../models/lora_weights'
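
Under the same assumption about how the script derives gradient accumulation, this run uses 128 / (8 micro-batch × 8 processes) = 2 accumulation steps per optimizer update, and --val_set_size 0 disables the held-out split that the finetune.py run above reserves 10000 examples for.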

View File

@@ -1,6 +0,0 @@
{
"description": "Template used by Law Instruction Tuning",
"prompt_input": "下面是一个问题,运用法学知识来正确回答提问.\n### 问题:\n{instruction}\n### 回答:\n",
"prompt_no_input": "下面是一个问题,运用法学知识来正确回答提问.\n### 问题:\n{instruction}\n### 回答:\n",
"response_split": "### 回答:"
}
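
Note that prompt_input and prompt_no_input are identical and neither contains an {input} placeholder, so this template effectively ignores the input field. A hedged rendering sketch using plain str.format, independent of the training code (the templates/ path is the location the Prompter class below expects; the question is taken from the data above):

# Sketch: fill the law template for one question.
import json

with open("templates/law_template.json", encoding="utf-8") as fp:
    template = json.load(fp)
prompt = template["prompt_no_input"].format(instruction="酒驾撞人要判多久?")
print(prompt)
# 下面是一个问题,运用法学知识来正确回答提问.
# ### 问题:
# 酒驾撞人要判多久?
# ### 回答: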

View File

View File

@@ -1,75 +0,0 @@
"""
Helpers to support streaming generate output.
Borrowed from https://github.com/oobabooga/text-generation-webui/blob/ad37f396fc8bcbab90e11ecf17c56c97bfbd4a9c/modules/callbacks.py
"""
import gc
import traceback
from queue import Queue
from threading import Thread
import torch
import transformers
class Stream(transformers.StoppingCriteria):
    def __init__(self, callback_func=None):
        self.callback_func = callback_func

    def __call__(self, input_ids, scores) -> bool:
        if self.callback_func is not None:
            self.callback_func(input_ids[0])
        return False


class Iteratorize:
    """
    Transforms a function that takes a callback
    into a lazy iterator (generator).
    """

    def __init__(self, func, kwargs=None, callback=None):
        self.mfunc = func
        self.c_callback = callback
        self.q = Queue()
        self.sentinel = object()
        self.kwargs = kwargs or {}  # avoid sharing a mutable default argument
        self.stop_now = False

        def _callback(val):
            if self.stop_now:
                raise ValueError
            self.q.put(val)

        def gentask():
            ret = None  # keep `ret` defined even if the wrapped call fails early
            try:
                ret = self.mfunc(callback=_callback, **self.kwargs)
            except ValueError:
                pass
            except Exception:
                traceback.print_exc()
            self.q.put(self.sentinel)
            if self.c_callback:
                self.c_callback(ret)

        self.thread = Thread(target=gentask)
        self.thread.start()

    def __iter__(self):
        return self

    def __next__(self):
        obj = self.q.get(True, None)
        if obj is self.sentinel:
            raise StopIteration
        else:
            return obj

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop_now = True
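
A usage sketch for the two helpers (not part of this commit): Stream pushes each partial sequence from model.generate into a callback, and Iteratorize turns that callback into a generator that can be consumed while generation runs in a background thread. The names model, tokenizer, and input_ids are assumed to exist already; torch is available from the import above.

# Sketch: stream decoded text out of model.generate with Stream + Iteratorize.
from transformers import StoppingCriteriaList

def generate_with_callback(callback=None, **kwargs):
    kwargs["stopping_criteria"] = StoppingCriteriaList([Stream(callback_func=callback)])
    with torch.no_grad():
        model.generate(**kwargs)

gen_kwargs = {"input_ids": input_ids, "max_new_tokens": 128}
with Iteratorize(generate_with_callback, kwargs=gen_kwargs) as generator:
    for partial_ids in generator:
        print(tokenizer.decode(partial_ids, skip_special_tokens=True), end="\r")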

View File

@@ -1,196 +0,0 @@
import math
import os
import sys
import fire
from tqdm import tqdm
import pandas as pd
import torch
import transformers
from peft import PeftModel
import datasets
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
from utils.callbacks import Iteratorize, Stream
from utils.prompter import Prompter
device = "cuda"
def main(
    load_8bit: bool = True,
    base_model: str = "decapoda-research/llama-7b-hf",
    lora_weights: str = "./lora-alpaca",
    data_path: str = "./data",
    output_path: str = "./output",
    eval_rate: float = 0.1,
    batch_size: int = 32,
    # The prompt template to use, will default to alpaca.
    prompt_template: str = "alpaca",
):
    base_model = base_model or os.environ.get("BASE_MODEL", "")
    assert base_model, "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"
    prompter = Prompter(prompt_template)
    tokenizer = LlamaTokenizer.from_pretrained(base_model)
    if device == "cuda":
        model = LlamaForCausalLM.from_pretrained(
            base_model,
            load_in_8bit=load_8bit,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        model = PeftModel.from_pretrained(
            model,
            lora_weights,
            torch_dtype=torch.float16,
        )
    # unwind broken decapoda-research config
    model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
    model.config.bos_token_id = 1
    model.config.eos_token_id = 2
    if not load_8bit:
        model.half()  # seems to fix bugs for some users.
    model.eval()
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)

    def evaluate_one(
        instruction,
        input=None,
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=2,
        max_new_tokens=128,
        **kwargs,
    ):
        prompt = prompter.generate_prompt(instruction, input)
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].to(device)
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            **kwargs,
        )
        # Without streaming
        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=max_new_tokens,
            )
        s = generation_output.sequences[0]
        output = tokenizer.decode(s, skip_special_tokens=True)
        return prompter.get_response(output)

    def evaluate_all():
        # data = datasets.load_dataset("json", data_files=data_path)
        # data = data["train"]
        # df = data.to_pandas()
        df = pd.read_json(data_path, orient='records')
        print(df.info())
        # Compute exact-match accuracy one sample at a time
        correct = 0
        total = 0
        total_step = len(df)
        pbar = tqdm(total=total_step, unit='batch')
        error = []
        for i in range(total_step):
            instruction = df['instruction'].iloc[i]
            input = df['input'].iloc[i]
            label = df['output'].iloc[i]
            pred = evaluate_one(instruction=instruction, input=input)
            if pred == label:
                correct += 1
            else:
                error.append((label, pred))
            total += 1
            acc = correct / total
            # Update the progress bar
            pbar.set_description(
                f"Testing: Sample [{total}/{total_step}] Acc: {acc :.4f}")
            pbar.update(1)
        for e in error:
            print(e)

    def evaluate_by_batch(
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=1,
        max_new_tokens=32
    ):
        df = pd.read_json(data_path, orient='records')
        # df = df.sample(frac=eval_rate).reset_index(drop=True)
        df['prompt'] = df.apply(lambda x: prompter.generate_prompt(
            x['instruction'], x['input']), axis=1)
        tokenizer.padding_side = "left"  # Allow batched inference
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams
        )
        outputs = []
        total = 0
        total_step = math.ceil(len(df) / batch_size)
        pbar = tqdm(total=total_step, unit='batch')
        # Generate predictions batch by batch; predictions are saved, no accuracy is computed here
        with torch.no_grad():
            for i in range(total_step):
                batch = df.iloc[i*batch_size:(i+1)*batch_size]
                inputs = tokenizer(batch['prompt'].tolist(), return_tensors="pt", padding=True)[
                    'input_ids'].to(device)
                generation_outputs = model.generate(
                    input_ids=inputs,
                    generation_config=generation_config,
                    max_new_tokens=max_new_tokens,
                    pad_token_id=tokenizer.pad_token_id
                )
                for g in generation_outputs:
                    decoded_item = tokenizer.decode(
                        g, skip_special_tokens=True)
                    try:
                        output = prompter.get_response(decoded_item)
                    except Exception:
                        output = decoded_item
                    outputs.append(output)
                total += 1
                # Update the progress bar
                pbar.set_description(f"Testing: Sample [{total}/{len(df)}] ")
                pbar.update(1)
        df['pred'] = outputs
        df['pred'].to_csv(output_path, index=False)

    evaluate_by_batch()


if __name__ == "__main__":
    # fire.Fire(main)
    import yaml

    dataset_param = sys.argv[1]
    with open("./configs/evaluate_params.yaml", "r") as stream:
        # try:
        params = yaml.safe_load(stream)
        print('=' * 80)
        print(params[dataset_param])
        print('=' * 80)
        # fire.Fire(train)
        main(**params[dataset_param])
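
The __main__ block reads its keyword arguments from ./configs/evaluate_params.yaml, keyed by the first command-line argument; that file is not part of this diff, so the following is only a hypothetical sketch of one entry whose keys mirror main()'s parameters (section name, paths, and values are all illustrative):

# Hypothetical: build and print one evaluate_params.yaml section.
import yaml

example = {
    "law_eval": {  # would be selected by passing "law_eval" as the first CLI argument
        "base_model": "minlik/chinese-llama-7b-merged",
        "lora_weights": "entity303/lawgpt-lora-7b",
        "data_path": "./data/law_eval.json",
        "output_path": "./outputs/law_eval_pred.csv",
        "batch_size": 32,
        "prompt_template": "law_template",
    }
}
print(yaml.safe_dump(example, sort_keys=False))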

View File

@@ -1,51 +0,0 @@
import os
import torch
import transformers
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer # noqa: F402
BASE_MODEL = os.environ.get("BASE_MODEL", None)
assert (
    BASE_MODEL
), "Please specify a value for BASE_MODEL environment variable, e.g. `export BASE_MODEL=huggyllama/llama-7b`"  # noqa: E501

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

base_model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

first_weight = base_model.model.layers[0].self_attn.q_proj.weight
first_weight_old = first_weight.clone()

lora_model = PeftModel.from_pretrained(
    base_model,
    "../outputs/lora-llama-clm-e2",
    device_map={"": "cpu"},
    torch_dtype=torch.float16,
)

lora_weight = lora_model.base_model.model.model.layers[0].self_attn.q_proj.weight
assert torch.allclose(first_weight_old, first_weight)

# merge weights - new merging method from peft
lora_model = lora_model.merge_and_unload()
lora_model.train(False)

# did we do anything?
assert not torch.allclose(first_weight_old, first_weight)

lora_model_sd = lora_model.state_dict()
deloreanized_sd = {
    k.replace("base_model.model.", ""): v
    for k, v in lora_model_sd.items()
    if "lora" not in k
}

LlamaForCausalLM.save_pretrained(
    base_model, '../models/legal-base-7b', state_dict=deloreanized_sd, max_shard_size="400MB"
)
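
A short follow-up sketch (not in this commit) showing how the merged checkpoint written above could be reloaded to confirm the export. The merge script does not save a tokenizer, so the tokenizer loaded from BASE_MODEL above is reused, and the prompt text is only an example.

# Sketch: reload the merged model saved above and run a quick generation on CPU.
merged = LlamaForCausalLM.from_pretrained("../models/legal-base-7b")
input_ids = tokenizer("酒驾撞人要判多久?", return_tensors="pt").input_ids
print(tokenizer.decode(merged.generate(input_ids, max_new_tokens=16)[0], skip_special_tokens=True))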

View File

@@ -1,51 +0,0 @@
"""
A dedicated helper to manage templates and prompt building.
"""
import json
import os.path as osp
from typing import Union
class Prompter(object):
    __slots__ = ("template", "_verbose")

    def __init__(self, template_name: str = "", verbose: bool = False):
        self._verbose = verbose
        if not template_name:
            # Enforce the default here, so the constructor can be called with '' and will not break.
            template_name = "alpaca"
        file_name = osp.join("templates", f"{template_name}.json")
        if not osp.exists(file_name):
            raise ValueError(f"Can't read {file_name}")
        with open(file_name) as fp:
            self.template = json.load(fp)
        if self._verbose:
            print(
                f"Using prompt template {template_name}: {self.template['description']}"
            )

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(
                instruction=instruction
            )
        if label:
            res = f"{res}{label}"
        if self._verbose:
            print(res)
        return res

    def get_response(self, output: str) -> str:
        return output.split(self.template["response_split"])[1].strip()
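
A minimal usage sketch for this class with the law_template shown earlier (assuming the template is saved as templates/law_template.json relative to the working directory; the question comes from the instruction-tuning data above and the model output is a stand-in):

# Sketch: build a prompt with law_template and recover the answer text after generation.
prompter = Prompter("law_template")
prompt = prompter.generate_prompt("酒驾撞人要判多久?")  # no input, so prompt_no_input is used
full_text = prompt + "处三年以下有期徒刑或者拘役……"  # stand-in for the model's decoded output
print(prompter.get_response(full_text))  # everything after "### 回答:"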