介绍

到目前为止，我们利用openai的api做了6个任务，想统计下每个任务的花费情况，现做简单统计和绘图

一、收集缓存的json数据，从中提取需要的字段，转换成DataFrame格式

# Load every cached API response. `json_files` and `src_data` are expected to
# be defined earlier in the script (they are not part of this section).
for json_file in json_files:
    # A context manager closes each file handle right after parsing instead of
    # leaving it open until garbage collection.
    with open(json_file, "r", encoding="utf-8") as json_fp:
        json_data = json.load(json_fp)
    src_data.append(json_data)
data = []
cnt = 0  # number of cached records that carry a non-empty prompt
for one in tqdm(src_data):
    prompt_prefix = one.get("input_prompt")
    if not prompt_prefix:
        # Records without a prompt are skipped entirely.
        continue
    cnt += 1
    prompt_prefix_len = len(prompt_prefix)
    question = one["input_text"]
    question_len = len(question)
    result = one["result"]
    model = result["model"]
    # Token counts as reported in the "usage" section of the cached result.
    prompt_token_num = result["usage"]["prompt_tokens"]
    anwser_token_num = result["usage"]["completion_tokens"]
    total_token_num = result["usage"]["total_tokens"]
    # Only the first choice of the response is taken as the answer.
    anwser = result["choices"][0]["message"]["content"]
    anwser_len = len(anwser)
    one_data = {
        "prompt_prefix": prompt_prefix,
        "prompt_prefix_len": prompt_prefix_len,
        "question": question,
        "question_len": question_len,
        "model": model,
        "prompt_token_num": prompt_token_num,
        "anwser_token_num": anwser_token_num,
        "total_token_num": total_token_num,
        "anwser": anwser,
        "anwser_len": anwser_len,
        # Combined len() of prompt prefix, question and answer strings.
        "total_length": prompt_prefix_len + question_len + anwser_len,
    }
    data.append(one_data)
# Convert the collected records into a DataFrame.
df = pd.DataFrame(data)
print(f"共收集到{len(df)}条数据，列名是{df.columns}")

共收集到9624条数据

首先对每个任务的问题数量进行统计和绘图

def plot_prompt_prefix_freq(df_data, min_count=10,
                            png_file="/Users/admin/tmp/prompt_prefix_freq.png"):
    """Plot the number of questions per prompt type and save it as a PNG.

    Args:
        df_data: DataFrame that contains a ``prompt_type`` column.
        min_count: Prompt types with ``min_count`` rows or fewer are dropped
            before plotting. Defaults to 10, the threshold the original code
            hard-coded.
        png_file: Path the bar chart is written to.

    Returns:
        None. The chart is saved to ``png_file``; progress is printed.
    """
    # Count rows per prompt type, most frequent first.
    df_prompt_prefix_cnt = df_data['prompt_type'].value_counts().sort_values(ascending=False)
    print(f"共有{len(df_prompt_prefix_cnt)}个问题,列名分别是: {df_prompt_prefix_cnt.index}")
    # The old message claimed that types with a single result were removed,
    # while the filter actually drops everything at or below the threshold.
    print(f"去掉数量不超过{min_count}个的问题")
    df_prompt_prefix_cnt = df_prompt_prefix_cnt[df_prompt_prefix_cnt > min_count]
    print(f"剩余共有{len(df_prompt_prefix_cnt)}个问题,列名分别是: {df_prompt_prefix_cnt.index}")
    if df_prompt_prefix_cnt.empty:
        # Nothing left to draw; skip plotting rather than failing inside the
        # plotting call on an empty frame.
        print("没有满足条件的问题，跳过绘图")
        return
    df_counts = df_prompt_prefix_cnt.reset_index()
    df_counts.columns = ['prompt_prefix', 'count']
    fig, ax = plt.subplots(figsize=(20, 10), dpi=100)
    df_counts.plot.bar(x='prompt_prefix', y='count', ax=ax)
    plt.xlabel('prompt_prefix')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.title('问题分布数量分别')
    # Save the chart to disk.
    fig.savefig(png_file)
    print(f"保存到文件{png_file}")

对每类任务的问题总数、提示+问题+答案的总长度、token总数以及总价格进行绘图

def plot_prompt_token_num2(df_data, price_unit=0.014 / 1000,
                           png_file="/Users/admin/tmp/question_prices2.png",
                           y_max=2000000):
    """Plot per-prompt-type question count, total length, tokens and price.

    Rows are grouped by ``prompt_type``; for each group the number of rows,
    the summed ``total_length``, the summed ``total_token_num`` and the
    resulting price are drawn as a grouped bar chart and saved as a PNG.

    Args:
        df_data: DataFrame with ``prompt_type``, ``total_length`` and
            ``total_token_num`` columns.
        price_unit: Price of a single token in RMB. Defaults to
            ``0.014 / 1000``, the value the original code hard-coded.
        png_file: Path the chart is written to.
        y_max: Upper limit of the y axis (default 2,000,000).

    Returns:
        None. The chart is saved to ``png_file``; progress is printed.
    """
    # Number of rows per prompt type.
    df_prompt_prefix_cnt = df_data['prompt_type'].value_counts().sort_values(ascending=False)
    df_counts = df_prompt_prefix_cnt.reset_index()
    df_counts.columns = ['prompt_type', 'count']
    # Sum the text length and the token usage per prompt type.
    df_sum = df_data.groupby("prompt_type").agg({"total_length": "sum", "total_token_num": "sum"})
    df_sum = df_sum.reset_index()
    df_sum = df_sum.sort_values(by="total_token_num", ascending=False)
    # Attach the per-type row count.
    df_sum = pd.merge(df_sum, df_counts, on="prompt_type", how="left")
    # Put the count column first; copy so the column assignment below works on
    # an independent frame rather than on a slice.
    df_sum = df_sum[["prompt_type", "count", "total_length", "total_token_num"]].copy()
    # Total price = total tokens * price per token.
    df_sum["total_price"] = df_sum["total_token_num"] * price_unit
    print(f"形状是{df_sum.shape}")
    # Rename the columns to the labels shown in the legend.
    df_sum.columns = ["prompt_type", "问题个数", "单词总数", "总token数","总价格人民币"]
    # Column name -> list of values; prompt types become the x tick labels.
    df_list = df_sum.to_dict(orient="list")
    prompt_types = df_list.pop("prompt_type")
    x = np.arange(len(prompt_types))  # the label locations
    width = 0.1  # the width of the bars

    fig, ax = plt.subplots(layout='constrained',figsize=(20, 10), dpi=100)

    # One bar series per remaining column, each shifted right by one width.
    for multiplier, (attribute, measurement) in enumerate(df_list.items()):
        offset = width * multiplier
        rects = ax.bar(x + offset, measurement, width, label=attribute)
        ax.bar_label(rects, padding=5)

    ax.set_ylabel('数量、长度、价格')
    ax.set_title('问题类型和价格，问题长度，token数量的关系')
    # Centre each tick under its group of bars: the series occupy offsets
    # 0 .. (n - 1) * width, so the middle is (n - 1) * width / 2. The former
    # fixed `x + width` is only centred when there are exactly three series.
    ax.set_xticks(x + width * (len(df_list) - 1) / 2, prompt_types)
    ax.legend(loc='upper left', ncols=5)
    ax.set_ylim(0, y_max)
    fig.savefig(png_file)
    print(f"保存到文件{png_file}")

总结

我们的任务都是中文的任务，根据单词总数和总的token数量的关系，我们可以算出token和单词总数的占比

`1.32419/1.70707=0.77570`，`1.25557/1.64893=0.76144`

所以1000个token大概相当于中文770个单词左右。由于我们在统计中没有对少量英文进行切词，所以如果是纯中文，大概还会更多一些，相当于800个中文单词左右。

openai的价格估计

https://johnson7788.github.io/2023/05/08/openai%E7%9A%84%E4%BB%B7%E6%A0%BC%E4%BC%B0%E8%AE%A1/

作者

Johnson

发布于

2023年5月8日

许可协议

sentence_transformers联网报错问题上一篇

利用chatgpt接口做超细粒度情感分析下一篇