# Split a text into smaller chunks of size n, preferably ending at the end of a sentence
def create_chunks(text, n, tokenizer):
tokens = tokenizer.encode(text)
"""Yield successive n-sized chunks from text."""
i = 0
while i < len(tokens):
# Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
j = min(i + int(1.5 * n), len(tokens))
while j > i + int(0.5 * n):
# Decode the tokens and check for full stop or newline
chunk = tokenizer.decode(tokens[i:j])
if chunk.endswith(".") or chunk.endswith("\n"):
break
j -= 1
# If no end of sentence found, use n tokens as the chunk size
if j == i + int(0.5 * n):
j = min(i + n, len(tokens))
yield tokens[i:j]
i = j
def extract_chunk(document,template_prompt):
prompt = template_prompt.replace('<document>',document)
messages = [
{"role": "system", "content": "You help extract information from documents."},
{"role": "user", "content": prompt}
]
response = client.chat.completions.create(
model='gpt-4',
messages=messages,
temperature=0,
max_tokens=1500,
top_p=1,
frequency_penalty=0,
presence_penalty=0
)
return "1." + response.choices[0].message.content