Skip to content

langchain使用token切分chunk

from modelscope.hub.snapshot_download import snapshot_download
from transformers import AutoTokenizer
from langchain_text_splitters import RecursiveCharacterTextSplitter
import PyPDF2


def extract_text_from_pdf(pdf_file):
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            text += reader.pages[page_num].extract_text()
    return text


pdf_text = extract_text_from_pdf('/Users/xiniao/Downloads/FATF-ME-FATF-HK-FUR-4-1-202302.pdf')
print(pdf_text)

if __name__ == '__main__':
    model_id = 'Qwen/Qwen2.5-72B-Instruct'
    # cache_dir = './models'
    ignore_patterns=['*.safetensors']
    print("开始下载模型")
    model_dir = snapshot_download(model_id, ignore_patterns=ignore_patterns)
    print(model_dir)
    print("开始加载tokenizer")

    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    print(tokenizer)
    encode = tokenizer.encode('你好')

    print(encode)

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer, chunk_size=1000, chunk_overlap=100
    )
    texts = text_splitter.split_text(pdf_text)

    for t in texts:
        print("-" * 80)
        print()
        print(t)