Commit 9fd003ad by Syed Muhammad

Upload New File

parent 41172b79
import os
import glob
import math
import re
def chunk_file(file_path, max_size):
    """Split a UTF-8 text file into sentence-aligned chunks of bounded size.

    The text is split after every ``.``, ``!`` or ``?`` that is followed by
    whitespace. Consecutive sentences are packed into one chunk for as long
    as the chunk's UTF-8 encoded size stays at or below ``max_size``. The
    original whitespace between sentences is kept inside a chunk; the
    whitespace at a chunk boundary (and any trailing whitespace of the file)
    is dropped.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        max_size: Maximum chunk size in bytes, measured on the UTF-8 encoding.

    Returns:
        A list of chunk strings in file order. The list is empty for an empty
        file. A single sentence that is larger than ``max_size`` on its own is
        not split further and is returned as one oversized chunk.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # The capturing group makes re.split also return the whitespace runs, so
    # even indexes hold sentences and odd indexes hold the separators.
    parts = re.split(r'(?<=[.!?])(\s+)', content)
    sentences = parts[0::2]
    separators = [''] + parts[1::2]  # separators[i] precedes sentences[i]

    chunks = []
    pieces = []  # sentences and separators of the chunk being built
    current_size = 0  # UTF-8 byte length of ''.join(pieces)
    for separator, sentence in zip(separators, sentences):
        if not sentence:
            # Only produced by empty content or trailing whitespace.
            continue
        sentence_size = len(sentence.encode('utf-8'))
        if not pieces:
            pieces = [sentence]
            current_size = sentence_size
            continue
        separator_size = len(separator.encode('utf-8'))
        joined_size = current_size + separator_size + sentence_size
        if joined_size > max_size:
            # Adding this sentence would overflow: close the current chunk
            # and start a new one without the boundary whitespace.
            chunks.append(''.join(pieces))
            pieces = [sentence]
            current_size = sentence_size
        else:
            pieces += [separator, sentence]
            current_size = joined_size
    if pieces:
        chunks.append(''.join(pieces))
    return chunks
def store_chunks(chunks, output_folder, original_file_name):
    """Write each chunk to its own numbered UTF-8 text file.

    Files are named ``<original_file_name>_<n>.txt`` with ``n`` starting at 1
    and are created inside ``output_folder``. Existing files with the same
    name are overwritten.

    Args:
        chunks: Iterable of strings, one per output file.
        output_folder: Directory to write into; created (including missing
            parent directories) when it does not exist yet.
        original_file_name: Base name used as the prefix of every chunk file.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)
    for i, chunk in enumerate(chunks, 1):
        chunk_file_name = f'{original_file_name}_{i}.txt'
        chunk_file_path = os.path.join(output_folder, chunk_file_name)
        # Named out_file so the handle does not shadow the chunk_file() function.
        with open(chunk_file_path, 'w', encoding='utf-8') as out_file:
            out_file.write(chunk)
def process_files(input_folder, output_folder, max_size):
    """Chunk every ``.txt`` file found directly inside ``input_folder``.

    Each matching file is split with ``chunk_file`` and the resulting pieces
    are written by ``store_chunks`` using the file's name without extension
    as the prefix. A message is printed and nothing is written when the input
    folder does not exist or contains no ``*.txt`` files.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.txt`` files.
        output_folder: Directory that receives the chunk files.
        max_size: Maximum chunk size in bytes, forwarded to ``chunk_file``.
    """
    folder_missing = not os.path.exists(input_folder)
    if folder_missing:
        print(f"Input folder '{input_folder}' does not exist.")
        return

    pattern = os.path.join(input_folder, '*.txt')
    sources = glob.glob(pattern)
    if not sources:
        print(f"No TXT files found in '{input_folder}'.")
        return

    for source in sources:
        # Strip directory and extension to get the chunk-file prefix.
        stem, _extension = os.path.splitext(os.path.basename(source))
        pieces = chunk_file(source, max_size)
        store_chunks(pieces, output_folder, stem)
# Usage: chunk every .txt file in `input_folder` and write the pieces to
# `output_folder`.
# NOTE(review): these statements run on import as well as when the file is
# executed as a script; consider an `if __name__ == "__main__":` guard.
input_folder = 'content/irs'
output_folder = 'content/chunk'
max_size = 7000 # Maximum chunk size in bytes (7000 B is about 6.8 KiB, not 15 KB)
process_files(input_folder, output_folder, max_size)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment