Upload New File

9fd003ad · Syed Muhammad · 41172b79 · 9fd003ad
Commit 9fd003ad authored Jun 21, 2023 by Syed Muhammad
Hide whitespace changes
Inline Side-by-side

Showing with 53 additions and 0 deletions

chunk_1.py chunk_1.py +53 -0

No files found.
--- a/chunk_1.py
+++ b/chunk_1.py
+import os
+import glob
+import math
+import re
+
def chunk_file(file_path, max_size):
    """Split a UTF-8 text file into sentence-aligned chunks.

    The file content is split after sentence-ending punctuation
    (``.``, ``!`` or ``?``) followed by whitespace. Consecutive sentences
    are packed into one chunk, separated by a single space, for as long as
    the chunk's UTF-8 encoded size stays within ``max_size``.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        max_size: Maximum chunk size in bytes, measured on the UTF-8
            encoding of the chunk text.

    Returns:
        A list of chunk strings in document order. The list is empty for an
        empty file. A single sentence that is larger than ``max_size`` on
        its own is not broken up; it is returned as one oversized chunk.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    chunks = []
    current_sentences = []
    current_size = 0  # UTF-8 size of ' '.join(current_sentences)
    for sentence in re.split(r'(?<=[.!?])\s+', content):
        if not sentence:
            # re.split yields '' for empty input or trailing whitespace.
            continue
        sentence_size = len(sentence.encode('utf-8'))
        # The joining space only exists when the chunk already has content.
        separator_size = 1 if current_sentences else 0
        if current_sentences and current_size + separator_size + sentence_size > max_size:
            chunks.append(' '.join(current_sentences))
            current_sentences = []
            current_size = 0
            separator_size = 0
        current_sentences.append(sentence)
        current_size += separator_size + sentence_size

    if current_sentences:
        chunks.append(' '.join(current_sentences))
    return chunks
+
def store_chunks(chunks, output_folder, original_file_name):
    """Write each chunk to its own numbered UTF-8 text file.

    Files are named ``<original_file_name>_<i>.txt`` with ``i`` starting
    at 1, and are created inside ``output_folder``. Existing files with the
    same names are overwritten.

    Args:
        chunks: Iterable of chunk strings to write.
        output_folder: Destination directory; created (including parents)
            when it does not exist yet.
        original_file_name: Base name used as the prefix of every chunk
            file name.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)
    for index, chunk in enumerate(chunks, 1):
        chunk_path = os.path.join(output_folder, f'{original_file_name}_{index}.txt')
        with open(chunk_path, 'w', encoding='utf-8') as out_file:
            out_file.write(chunk)
+
def process_files(input_folder, output_folder, max_size):
    """Chunk every ``*.txt`` file in ``input_folder`` into ``output_folder``.

    Each matching file is split with ``chunk_file`` and the resulting
    chunks are written with ``store_chunks``, using the input file's name
    without its extension as the chunk-file prefix. Files are processed in
    sorted path order so repeated runs behave the same way.

    Prints a message and returns without doing anything when the input
    folder does not exist or contains no ``.txt`` files.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.txt``.
        output_folder: Directory that receives the chunk files.
        max_size: Maximum chunk size in bytes, passed to ``chunk_file``.
    """
    if not os.path.exists(input_folder):
        print(f"Input folder '{input_folder}' does not exist.")
        return

    # glob returns matches in arbitrary order; sort for determinism.
    txt_files = sorted(glob.glob(os.path.join(input_folder, '*.txt')))
    if not txt_files:
        print(f"No TXT files found in '{input_folder}'.")
        return

    for file_path in txt_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        chunks = chunk_file(file_path, max_size)
        store_chunks(chunks, output_folder, file_name)
+
# Usage: chunk every .txt file under input_folder into output_folder.
input_folder = 'content/irs'
output_folder = 'content/chunk'
max_size = 7000  # Maximum chunk size in bytes (about 7 KB)

process_files(input_folder, output_folder, max_size)
\ No newline at end of file