import os
import pandas as pd
import re
# === CONFIG ===
input_folder = r"your\input\folder\path" # 🔁 Update this path
output_file = "combined_output.xlsx"
keywords = ["DOS", "Patient", "DOB", "SSN"]
pattern = re.compile('|'.join(keywords), re.IGNORECASE)
combined_data = pd.DataFrame()
def find_matching_columns(row):
"""Return indices and names of columns matching the keywords."""
return [(i, val) for i, val in enumerate(row) if pd.notna(val) and pattern.search(str(val))]
def extract_blocks(df_raw, source_file, sheet_name="CSV_File"):
"""Scan row-by-row to find blocks where keyword headers appear, extract data below."""
blocks = []
header_row_idx = None
current_block = None
block_start = None
for i in range(len(df_raw)):
row = df_raw.iloc[i].astype(str).fillna('').tolist()
# Check if this row has keyword-like headers
matched_cols = find_matching_columns(row)
if matched_cols:
# Finalize previous block if valid
if header_row_idx is not None and block_start is not None and block_start > header_row_idx:
data_block = df_raw.iloc[header_row_idx+1:block_start].copy()
col_indices, col_names = zip(*find_matching_columns(df_raw.iloc[header_row_idx].tolist()))
data_block = data_block.loc[:, col_indices]
data_block.columns = col_names
data_block = data_block.dropna(how='all')
if not data_block.empty:
data_block['Source_File'] = source_file
data_block['Sheet_Name'] = sheet_name
data_block['Block_Header_Row'] = header_row_idx + 1
blocks.append(data_block)
# Start new block
header_row_idx = i
block_start = i + 1
elif header_row_idx is not None and i == len(df_raw) - 1:
# Handle final row as end of last block
data_block = df_raw.iloc[header_row_idx+1:].copy()
col_indices, col_names = zip(*find_matching_columns(df_raw.iloc[header_row_idx].tolist()))
data_block = data_block.loc[:, col_indices]
data_block.columns = col_names
data_block = data_block.dropna(how='all')
if not data_block.empty:
data_block['Source_File'] = source_file
data_block['Sheet_Name'] = sheet_name
data_block['Block_Header_Row'] = header_row_idx + 1
blocks.append(data_block)
elif header_row_idx is not None and pd.Series(row).isna().all():
# Handle empty row as block separator
data_block = df_raw.iloc[header_row_idx+1:i].copy()
col_indices, col_names = zip(*find_matching_columns(df_raw.iloc[header_row_idx].tolist()))
data_block = data_block.loc[:, col_indices]
data_block.columns = col_names
data_block = data_block.dropna(how='all')
if not data_block.empty:
data_block['Source_File'] = source_file
data_block['Sheet_Name'] = sheet_name
data_block['Block_Header_Row'] = header_row_idx + 1
blocks.append(data_block)
header_row_idx = None
return blocks
def process_excel(filepath, ext):
try:
engine = 'openpyxl' if ext == '.xlsx' else 'xlrd'
xls = pd.ExcelFile(filepath, engine=engine)
file_blocks = []
for sheet in xls.sheet_names:
try:
df_raw = xls.parse(sheet, header=None, dtype=str)
if df_raw.empty:
continue
blocks = extract_blocks(df_raw, os.path.basename(filepath), sheet)
file_blocks.extend(blocks)
except Exception as e:
print(f"❌ Error in sheet '{sheet}' of {filepath}: {e}")
return file_blocks
except Exception as e:
print(f"❌ Cannot read Excel file {filepath}: {e}")
return []
def process_csv(filepath):
try:
df_raw = pd.read_csv(filepath, header=None, dtype=str, encoding='utf-8', errors='ignore')
return extract_blocks(df_raw, os.path.basename(filepath))
except Exception as e:
print(f"❌ Cannot read CSV file {filepath}: {e}")
return []
# === MAIN LOOP ===
for filename in os.listdir(input_folder):
file_path = os.path.join(input_folder, filename)
ext = os.path.splitext(filename)[1].lower()
if ext in ['.xls', '.xlsx']:
blocks = process_excel(file_path, ext)
elif ext == '.csv':
blocks = process_csv(file_path)
else:
print(f"⚠️ Skipping unsupported file: {filename}")
continue
for block in blocks:
combined_data = pd.concat([combined_data, block], ignore_index=True)
# === SAVE OUTPUT ===
if not combined_data.empty:
combined_data.to_excel(output_file, index=False)
print(f"✅ Output saved: {output_file}")
else:
print("⚠️ No matching data found.")