我有以下 Python 代码,以及一些 GGML 模型。目标是使用 LLM 模型而不是句子转换器来总结我的所有 txt 文件。第一部分检查文本间距并将其转换为连续的行而不是段落。第二部分采用 LLM 总结内容,并将结果保存在特定文件夹中每个文本的文本文件中。
提供的代码运行良好,但缺少用于提示摘要的标题;此外,它也不像 ChatGPT 那样实时显示结果。
我希望在控制台中看到逐字符呈现的实时输出,以便监控进度并在需要时中止处理,但我不知道该怎么做。这方面确实需要大家的帮助,不胜感激。
import os
import re
from llama_cpp import Llama
# Folder containing the .txt files to summarize (raw string: Windows path).
input_directory = r"C:\Users\Peter-Susan\Desktop\test"
# Folder where the "<name>_summarized.txt" output files are written.
output_directory = r"C:\Users\Peter-Susan\Desktop"
def join_lines_in_files(directory_path):
    """Flatten every .txt file in *directory_path* to a single line.

    Each file is rewritten in place with every run of whitespace
    (spaces, tabs, newlines — including paragraph breaks) collapsed to
    one space, so the LLM receives one continuous line of text.

    Errors are reported per file so one unreadable file does not stop
    the remaining files from being processed.
    """
    try:
        files = [name for name in os.listdir(directory_path) if name.endswith('.txt')]
    except OSError as e:
        print("An error occurred:", str(e))
        return
    for name in files:
        file_path = os.path.join(directory_path, name)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # The original three regex passes ended with `\s+ -> ' '`,
            # which subsumes the first two; a single pass is equivalent.
            content = re.sub(r'\s+', ' ', content)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)
        except (OSError, UnicodeError) as e:
            print(f"An error occurred while processing '{file_path}':", str(e))
    print("Line joining and spacing removal completed successfully.")
# Define the path to the directory containing the text files
directory_path = input_directory
# Pre-process at import time: flatten every .txt file to one line so the
# prompt built later is a single continuous string.
join_lines_in_files(directory_path)
# Load the Llama model (GGML quantized weights, llama-cpp-python backend).
model_path = "./models/llama-2-7b-chat.ggmlv3.q5_K_M.bin"
# n_ctx=2048: context window; n_threads=7: CPU threads used for inference.
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=7)
def process_query(query, max_tokens=2048, temperature=0.1, top_p=0.5, stop=None, stream=True):
    """Run *query* through the Llama model and return the generated text.

    When *stream* is True (the default) tokens are printed to the console
    as they are generated — flushed immediately so progress is visible in
    real time and the run can be interrupted with Ctrl+C — while the full
    response is still accumulated and returned, so callers see the same
    return value as the original non-streaming version.

    Parameters mirror llama_cpp.Llama.__call__; *stop* defaults to ["#"]
    (a None sentinel is used to avoid a mutable default argument).

    Returns the stripped response text, or None on error.
    """
    if stop is None:
        stop = ["#"]
    try:
        if stream:
            pieces = []
            # llama_cpp yields incremental chunks when stream=True; each
            # chunk carries the newly generated token text.
            for chunk in llm(
                query,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                stop=stop,
                stream=True,
            ):
                token = chunk["choices"][0]["text"]
                print(token, end="", flush=True)
                pieces.append(token)
            print()  # terminate the streamed line
            return "".join(pieces).strip()
        # Non-streaming path: identical to the original behavior.
        response = llm(
            query,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=stop,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print("Error generating response:", str(e))
        return None
def get_title_from_path(file_path):
    """Return the file's base name with its final extension removed."""
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem
def process_text_file(input_file_path, output_directory):
    """Summarize one text file with the LLM and save the result.

    Reads *input_file_path*, prepends the summarization header, sends the
    prompt through process_query, and writes the response to
    ``<output_directory>/<stem>_summarized.txt``.
    """
    # Read text from the input file; report failures instead of crashing.
    try:
        with open(input_file_path, "r", encoding="utf-8") as file:
            file_content = file.read()
    except (OSError, UnicodeError) as e:
        print(f"Error: Could not read text from '{input_file_path}'. ({e})")
        return
    # Bug fix: the original tested `if input_text:` AFTER prepending the
    # header, so the check could never fail; test the file content itself.
    if not file_content.strip():
        print(f"Error: Could not read text from '{input_file_path}'.")
        return
    # Concatenate the header with the file content to form the prompt.
    header = 'Summarize in detail with at least 50 words: '
    input_text = header + file_content
    print(input_text)
    # Process the input text using the Llama model.
    response = process_query(input_text)
    if response:
        # Write the generated output next to the other summaries.
        output_file_path = os.path.join(
            output_directory,
            f"{get_title_from_path(input_file_path)}_summarized.txt",
        )
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write(response)
        print(f"Summarized content for '{input_file_path}' written to '{output_file_path}'.")
    else:
        print(f"Error generating response for '{input_file_path}'.")
if __name__ == "__main__":
    # Summarize every .txt file found in the input directory.
    text_files = (name for name in os.listdir(input_directory) if name.endswith(".txt"))
    for name in text_files:
        process_text_file(os.path.join(input_directory, name), output_directory)
即使是 Bard、ChatGPT 和 Claude 也无法提供帮助,可能是它们不了解 Llama 模型。我希望在控制台中看到逐字符呈现的实时输出,以便监控进度并在需要时中止处理。