Book Appointment!
Chat with us!
Call us!

Home

Book Appointment

Chat with us

Call us

Back to top

Test002

import os
import pandas as pd
import re

# === CONFIG ===
input_folder = r"your\input\folder\path"   # 🔁 Update this path
output_file = "combined_output.xlsx"
keywords = ["DOS", "Patient", "DOB", "SSN"]
pattern = re.compile('|'.join(keywords), re.IGNORECASE)

combined_data = pd.DataFrame()

def find_matching_columns(row):
   """Return indices and names of columns matching the keywords."""
   return [(i, val) for i, val in enumerate(row) if pd.notna(val) and pattern.search(str(val))]

def extract_blocks(df_raw, source_file, sheet_name="CSV_File"):
   """Scan row-by-row to find blocks where keyword headers appear, extract data below."""
   blocks = []
   header_row_idx = None
   current_block = None
   block_start = None

   for i in range(len(df_raw)):
       row = df_raw.iloc[i].astype(str).fillna('').tolist()

       # Check if this row has keyword-like headers
       matched_cols = find_matching_columns(row)

       if matched_cols:
           # Finalize previous block if valid
           if header_row_idx is not None and block_start is not None and block_start > header_row_idx:
               data_block = df_raw.iloc[header_row_idx+1:block_start].copy()
               col_indices, col_names = zip(*find_matching_columns(df_raw.iloc[header_row_idx].tolist()))
               data_block = data_block.loc[:, col_indices]
               data_block.columns = col_names
               data_block = data_block.dropna(how='all')
               if not data_block.empty:
                   data_block['Source_File'] = source_file
                   data_block['Sheet_Name'] = sheet_name
                   data_block['Block_Header_Row'] = header_row_idx + 1
                   blocks.append(data_block)

           # Start new block
           header_row_idx = i
           block_start = i + 1

       elif header_row_idx is not None and i == len(df_raw) - 1:
           # Handle final row as end of last block
           data_block = df_raw.iloc[header_row_idx+1:].copy()
           col_indices, col_names = zip(*find_matching_columns(df_raw.iloc[header_row_idx].tolist()))
           data_block = data_block.loc[:, col_indices]
           data_block.columns = col_names
           data_block = data_block.dropna(how='all')
           if not data_block.empty:
               data_block['Source_File'] = source_file
               data_block['Sheet_Name'] = sheet_name
               data_block['Block_Header_Row'] = header_row_idx + 1
               blocks.append(data_block)

       elif header_row_idx is not None and pd.Series(row).isna().all():
           # Handle empty row as block separator
           data_block = df_raw.iloc[header_row_idx+1:i].copy()
           col_indices, col_names = zip(*find_matching_columns(df_raw.iloc[header_row_idx].tolist()))
           data_block = data_block.loc[:, col_indices]
           data_block.columns = col_names
           data_block = data_block.dropna(how='all')
           if not data_block.empty:
               data_block['Source_File'] = source_file
               data_block['Sheet_Name'] = sheet_name
               data_block['Block_Header_Row'] = header_row_idx + 1
               blocks.append(data_block)
           header_row_idx = None

   return blocks

def process_excel(filepath, ext):
   try:
       engine = 'openpyxl' if ext == '.xlsx' else 'xlrd'
       xls = pd.ExcelFile(filepath, engine=engine)
       file_blocks = []

       for sheet in xls.sheet_names:
           try:
               df_raw = xls.parse(sheet, header=None, dtype=str)
               if df_raw.empty:
                   continue
               blocks = extract_blocks(df_raw, os.path.basename(filepath), sheet)
               file_blocks.extend(blocks)
           except Exception as e:
               print(f"❌ Error in sheet '{sheet}' of {filepath}: {e}")

       return file_blocks
   except Exception as e:
       print(f"❌ Cannot read Excel file {filepath}: {e}")
       return []

def process_csv(filepath):
   try:
       df_raw = pd.read_csv(filepath, header=None, dtype=str, encoding='utf-8', errors='ignore')
       return extract_blocks(df_raw, os.path.basename(filepath))
   except Exception as e:
       print(f"❌ Cannot read CSV file {filepath}: {e}")
       return []

# === MAIN LOOP ===
for filename in os.listdir(input_folder):
   file_path = os.path.join(input_folder, filename)
   ext = os.path.splitext(filename)[1].lower()

   if ext in ['.xls', '.xlsx']:
       blocks = process_excel(file_path, ext)
   elif ext == '.csv':
       blocks = process_csv(file_path)
   else:
       print(f"⚠️ Skipping unsupported file: {filename}")
       continue

   for block in blocks:
       combined_data = pd.concat([combined_data, block], ignore_index=True)

# === SAVE OUTPUT ===
if not combined_data.empty:
   combined_data.to_excel(output_file, index=False)
   print(f"✅ Output saved: {output_file}")
else:
   print("⚠️ No matching data found.")