#!/usr/bin/env python3
"""
AIE4 Cron Job - Automated Testing Script

This script orchestrates the full test suite using pytest_lsf.py
"""

import re
import os
import subprocess
import time
import traceback
import json
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from pathlib import Path

import typer
from jinja2 import Template
import markdown
from tabulate import tabulate
from graph.utilities import logger
from buildtest.common import BuildTarget
from buildtest.pytest_lsf import get_test_list, submit_lsf_job, sanitize_job_name

# Paths
# NOTE: AIE4_ROOT_DIR must be set in the environment. os.environ.get() returns
# None when it is missing, and Path(None) raises TypeError at import time.
REPO_DIR = Path(os.environ.get("AIE4_ROOT_DIR"))
BUILDTEST_DIR = REPO_DIR / "buildtest"    # Working directory used when collecting buildtest tests
AIE4_BENCH_DIR = REPO_DIR / "aie4_bench"  # Working directory used when collecting model compilation tests
OUTPUT_DIR = REPO_DIR / "Output"
LSF_LOG_DIR = BUILDTEST_DIR / "lsf_logs"

# Job configuration
MAX_WAIT_SECONDS = 3 * 60 * 60  # 3 hours; per-job run time limit checked by JobRuntime.is_timeout
# Sleep between bjobs polls in wait_for_jobs.
# NOTE(review): the original note says "should be >10 seconds" but the value is
# exactly 10 - confirm which bound is intended.
WAIT_INTERVAL = 10  # seconds (should be >10 seconds)

# Email report configuration
MAX_JOBS_FOR_DETAILED_LOGS = 10  # Only include detailed logs if failures <= this number


class LSFJobStatus(str, Enum):
    """
    Job states as reported in the bjobs STAT column.

    NOT_FOUND is a local marker used when a queried job does not show up in
    the bjobs output at all.
    """
    PEND = "PEND"      # Waiting to be scheduled
    RUN = "RUN"        # Currently executing
    DONE = "DONE"      # Finished successfully
    EXIT = "EXIT"      # Finished with an error
    SSUSP = "SSUSP"    # Suspended by the system
    USUSP = "USUSP"    # Suspended by the user
    PSUSP = "PSUSP"    # Suspended due to higher priority job
    WAIT = "WAIT"      # Waiting on a dependency
    ZOMBI = "ZOMBI"    # Zombie job
    UNKWN = "UNKWN"    # Status unknown
    NOT_FOUND = "NOT_FOUND"  # Job not present in LSF system

    @classmethod
    def from_bjobs(cls, status_str: str) -> "LSFJobStatus":
        """
        Convert a STAT string from bjobs into an enum member.

        Args:
            status_str: Status text exactly as printed by bjobs

        Returns:
            The matching member, or UNKWN when the text is not a known status
        """
        try:
            member = cls(status_str)
        except ValueError:
            # Anything bjobs prints that we do not model is treated as unknown
            member = cls.UNKWN
        return member

    def is_active(self) -> bool:
        """Return True while the job is pending, running, suspended or waiting."""
        status_cls = type(self)
        active_states = (
            status_cls.PEND,
            status_cls.RUN,
            status_cls.WAIT,
            status_cls.SSUSP,
            status_cls.USUSP,
            status_cls.PSUSP,
        )
        return any(self is state for state in active_states)


@dataclass
class JobInfo:
    """
    Information about a submitted LSF job

    One instance is created by the submit_* functions for every test whose
    submit_lsf_job call returned a job ID.
    """
    job_id: str     # ID returned by submit_lsf_job; later passed to bjobs / bkill
    job_name: str   # "<job_prefix>_<sanitized test_id>"
    test_id: str    # Test ID as returned by get_test_list
    out_file: Path  # <output_dir>/<job_name>.out (scanned for completion and DI_PASS markers)
    err_file: Path  # <output_dir>/<job_name>.err


@dataclass
class JobRuntime:
    """Status and accumulated run time of one job, built from a bjobs -json record"""
    job_id: str
    status: LSFJobStatus
    run_time_seconds: int  # RUN_TIME converted to whole seconds
    run_time_raw: str      # RUN_TIME exactly as bjobs printed it (e.g., "123 second(s)")

    @property
    def is_timeout(self) -> bool:
        """True once the job's run time has reached MAX_WAIT_SECONDS"""
        return MAX_WAIT_SECONDS <= self.run_time_seconds

    @classmethod
    def from_bjobs_json(cls, record: dict) -> "JobRuntime | None":
        """
        Build a JobRuntime from one entry of the bjobs JSON "RECORDS" list

        Args:
            record: Single record from bjobs -json output
                    Example: {"JOBID": "12345", "STAT": "RUN", "RUN_TIME": "123 second(s)"}
                    Or: {"JOBID": "12345", "ERROR": "Job <12345> is not found"}

        Returns:
            JobRuntime instance, or None when the record carries an "ERROR" key
            (job not found - likely completed and purged)

        Raises:
            KeyError: if "JOBID" is missing, or "STAT" is missing from a non-error record
        """
        job_id = record["JOBID"]

        # Error records have no STAT/RUN_TIME; report them as "nothing to track"
        if "ERROR" in record:
            logger.debug("Job %s not found in bjobs (likely completed): %s", job_id, record["ERROR"])
            return None

        raw_run_time = record.get("RUN_TIME", "0 second(s)")
        return cls(
            job_id=job_id,
            status=LSFJobStatus.from_bjobs(record["STAT"]),
            run_time_seconds=cls._parse_run_time(raw_run_time),
            run_time_raw=raw_run_time,
        )

    @staticmethod
    def _parse_run_time(run_time_str: str) -> int:
        """
        Convert an LSF RUN_TIME string such as "123 second(s)" to seconds

        Examples:
            "123 second(s)" -> 123
            "5 minute(s)" -> 300
            "2 hour(s)" -> 7200

        Args:
            run_time_str: RUN_TIME string from bjobs ("<integer> <unit>")

        Returns:
            Runtime in seconds; 0 when the string has fewer than two tokens,
            the number is not an integer, or the unit is not recognized
        """
        # Checked in this order; the first keyword contained in the unit wins
        unit_factors = (
            ("second", 1),
            ("minute", 60),
            ("hour", 3600),
            ("day", 86400),
        )

        try:
            tokens = run_time_str.split()
            if len(tokens) < 2:
                return 0
            amount = int(tokens[0])
            unit = tokens[1].lower()
        except (ValueError, IndexError) as e:
            logger.warning("Failed to parse RUN_TIME '%s': %s", run_time_str, e)
            return 0

        for keyword, factor in unit_factors:
            if keyword in unit:
                return amount * factor

        logger.warning("Unknown time unit in '%s', defaulting to 0", run_time_str)
        return 0


@dataclass
class FailedJob:
    """
    Information about a failed job

    Instances are created by check_job_failures, one per job that did not
    report a successful completion in its .out file.
    """
    job_id: str
    job_name: str
    # Numeric exit code as text, or one of the markers used by
    # check_job_failures: "no_output_file", "unknown", "read_error"
    exit_code: str
    log_file: str  # String form of the job's .out file path
    # Status from bjobs at check time.
    # NOTE(review): the default is DONE even though this record describes a
    # failure; every visible construction site passes status explicitly.
    status: LSFJobStatus = LSFJobStatus.DONE


def submit_all_tests(
    job_prefix: str,
    output_dir: Path,
    test_filter: str,
    target: BuildTarget,
    output_root: str | None = None,
    hwtest: bool = False
) -> tuple[int, list[JobInfo]]:
    """
    Collect the buildtest tests and submit each one as its own LSF job

    Changes the process working directory to BUILDTEST_DIR, collects tests
    with "-k <test_filter> -m dma", and submits every collected test to the
    "medium" queue with a 16GB memory limit.

    Args:
        job_prefix: Prefix for job names
        output_dir: Directory for LSF output files
        test_filter: Pytest filter string (-k argument)
        target: Build target enum value
        output_root: Output root directory for pytest (optional)
        hwtest: Run tests on hardware

    Returns:
        (exit_code, list of submitted JobInfo objects); exit_code is 1 when
        nothing was collected or at least one submission failed, otherwise 0
    """
    os.chdir(BUILDTEST_DIR)

    logger.info("Collecting tests from buildtest directory...")
    collected = get_test_list(["-k", test_filter, "-m", "dma"])
    if not collected:
        logger.error("No tests collected!")
        return 1, []

    total = len(collected)
    logger.info("Found %d test(s) to submit", total)

    jobs: list[JobInfo] = []
    rejected = 0

    for position, test_id in enumerate(collected, start=1):
        job_name = f"{job_prefix}_{sanitize_job_name(test_id)}"
        logger.info("[%d/%d] Submitting: %s", position, total, test_id)

        job_id = submit_lsf_job(
            test_id=test_id,
            target=target,
            job_name=job_name,
            queue="medium",
            mem_limit="16GB",
            output_dir=output_dir,
            output_root=output_root,
            dry_run=False,
            hwtest=hwtest
        )

        # A falsy job ID means the submission did not go through
        if not job_id:
            rejected += 1
            continue

        jobs.append(JobInfo(
            job_id=job_id,
            job_name=job_name,
            test_id=test_id,
            out_file=output_dir / f"{job_name}.out",
            err_file=output_dir / f"{job_name}.err"
        ))

    logger.info("Successfully submitted: %d job(s)", len(jobs))
    if rejected > 0:
        logger.warning("Failed to submit: %d job(s)", rejected)

    return (1 if rejected > 0 else 0), jobs


def submit_model_compilation_tests(
    job_prefix: str,
    output_dir: Path,
    with_model_data: bool = False,
    model_filter: str | None = None,
) -> tuple[int, list[JobInfo]]:
    """
    Collect the model compilation tests and submit each one as its own LSF job.

    How these differ from the buildtest submissions:
    - Collected from test_model_compilation.py with the working directory set
      to AIE4_BENCH_DIR
    - Selected with the e2e_model_compilation marker
    - Need --run-model-compilation both at collection time and at run time
    - Submitted to the "long" queue with a 32GB memory limit
    - No --target is passed (that's a buildtest-specific fixture)

    Args:
        job_prefix: Prefix for job names
        output_dir: Directory for LSF output files (artifacts will be created under this)
        with_model_data: If True, include --with-model-data flag (only models with DataGen)
        model_filter: Glob-style pattern to filter models by name (e.g., 'psd*', 'Intel_bert')

    Returns:
        (exit_code, list of submitted JobInfo objects); exit_code is 1 when
        nothing was collected or at least one submission failed, otherwise 0
    """
    os.chdir(AIE4_BENCH_DIR)

    # Flags needed both for collection and for every submitted pytest run.
    # NOTE: --run-model-compilation is required during collection to trigger pytest_generate_tests
    shared_flags = ["--run-model-compilation"]
    if with_model_data:
        shared_flags.append("--with-model-data")
    if model_filter:
        shared_flags += ["--model-filter", model_filter]

    collect_args = ["-m", "e2e_model_compilation", *shared_flags, "test_model_compilation.py"]
    extra_pytest_args = list(shared_flags)

    mode_str = "with-model-data" if with_model_data else "standard"
    logger.info("Collecting model compilation tests from aie4_bench directory (mode: %s)...", mode_str)
    collected = get_test_list(collect_args)
    if not collected:
        logger.error("No model compilation tests collected!")
        return 1, []

    total = len(collected)
    logger.info("Found %d model compilation test(s) to submit", total)

    # All per-job artifact directories live under <output_dir>/artifacts
    artifacts_dir = output_dir / "artifacts"
    artifacts_dir.mkdir(parents=True, exist_ok=True)

    jobs: list[JobInfo] = []
    rejected = 0

    for position, test_id in enumerate(collected, start=1):
        job_name = f"{job_prefix}_{sanitize_job_name(test_id)}"

        # One artifact subdirectory per job, created before submission
        job_artifact_dir = artifacts_dir / job_name
        job_artifact_dir.mkdir(parents=True, exist_ok=True)

        logger.info("[%d/%d] Submitting: %s", position, total, test_id)

        job_id = submit_lsf_job(
            test_id=test_id,
            # target intentionally omitted - model compilation tests don't use --target
            job_name=job_name,
            queue="long",  # Model compilation can take hours
            mem_limit="32GB",  # Full model compilation needs more memory
            output_dir=output_dir,
            dry_run=False,
            extra_pytest_args=extra_pytest_args,
            artifact_dir=job_artifact_dir,
        )

        # A falsy job ID means the submission did not go through
        if not job_id:
            rejected += 1
            continue

        jobs.append(JobInfo(
            job_id=job_id,
            job_name=job_name,
            test_id=test_id,
            out_file=output_dir / f"{job_name}.out",
            err_file=output_dir / f"{job_name}.err"
        ))

    logger.info("Successfully submitted: %d job(s)", len(jobs))
    if rejected > 0:
        logger.warning("Failed to submit: %d job(s)", rejected)

    return (1 if rejected > 0 else 0), jobs


def get_job_status(job_ids: list[str]) -> dict[str, LSFJobStatus]:
    """
    Look up the current LSF status of the given jobs with "bjobs -w"

    Every requested ID starts out as NOT_FOUND and is overwritten when a
    matching row appears in the bjobs output. The bjobs exit code is ignored.

    Args:
        job_ids: List of job IDs to query

    Returns:
        Dict mapping job_id -> LSFJobStatus enum value (empty for an empty input)
    """
    if not job_ids:
        return {}

    status_map: dict[str, LSFJobStatus] = dict.fromkeys(job_ids, LSFJobStatus.NOT_FOUND)

    result = subprocess.run(
        ["bjobs", "-w"] + job_ids,
        capture_output=True,
        text=True,
        check=False
    )

    for raw_line in result.stdout.splitlines():
        stripped = raw_line.strip()
        # Data rows start with the numeric job ID; skip the header and blank lines
        if not stripped or not stripped[0].isdigit():
            continue

        columns = raw_line.split()
        if len(columns) < 3:
            continue

        # Column 0 is the job ID, column 2 is STAT
        status_map[columns[0]] = LSFJobStatus.from_bjobs(columns[2])

    return status_map


def get_job_runtimes(job_ids: list[str]) -> dict[str, JobRuntime]:
    """
    Fetch status and run time for LSF jobs from the bjobs JSON output

    Runs bjobs -o "jobid stat run_time" -json and converts each entry of the
    "RECORDS" list with JobRuntime.from_bjobs_json.

    Args:
        job_ids: List of job IDs to query

    Returns:
        Dict mapping job_id -> JobRuntime. Jobs whose record is an error
        (not found) are left out. The dict is empty when bjobs exits with a
        code other than 0 or 255; when parsing fails it holds whatever was
        converted before the failure.
    """
    if not job_ids:
        return {}

    command = ["bjobs", "-o", "jobid stat run_time", "-json"] + job_ids
    result = subprocess.run(command, capture_output=True, text=True, check=False)

    runtime_map: dict[str, JobRuntime] = {}

    # returncode 255 is normal when some jobs don't exist
    if result.returncode not in (0, 255):
        logger.warning("bjobs returned code %d: %s", result.returncode, result.stderr)
        return runtime_map

    try:
        payload = json.loads(result.stdout)
        for record in payload.get("RECORDS", []):
            parsed = JobRuntime.from_bjobs_json(record)
            if parsed is None:
                # Not found in LSF any more (already completed/purged)
                continue
            runtime_map[parsed.job_id] = parsed
    except (json.JSONDecodeError, KeyError) as e:
        logger.error("Failed to parse bjobs JSON output: %s", e)
        logger.debug("bjobs output was: %s", result.stdout)

    return runtime_map


def wait_for_jobs(submitted_jobs: list[JobInfo]) -> tuple[bool, str, list[JobInfo], list[dict]]:
    """
    Wait for all submitted jobs to complete, killing any whose run time
    reaches MAX_WAIT_SECONDS

    Polls bjobs every WAIT_INTERVAL seconds. Tracks per-job elapsed time using
    bjobs RUN_TIME and kills jobs individually when they reach the limit,
    allowing faster jobs to complete. Log tails of a job are captured right
    before it is killed.

    NOTE(review): there is no overall deadline. total_elapsed is only used in
    log messages, so the loop keeps polling until every job has been seen in
    bjobs at least once and none of them is active any more.

    Args:
        submitted_jobs: List of JobInfo objects

    Returns:
        (success, incomplete_jobs_table, incomplete_jobs_list, captured_logs)
        success is True when no job had to be killed (this function does not
        distinguish DONE from EXIT); in that case the other three values are
        "", [] and []. Otherwise they come from handle_killed_jobs.
    """
    if not submitted_jobs:
        logger.warning("No jobs to wait for")
        return True, "", [], []

    job_ids = [job.job_id for job in submitted_jobs]
    logger.info("Waiting for %d job(s): %s", len(job_ids), ", ".join(job_ids))

    # Track which jobs we've seen at least once
    jobs_seen = {job_id: False for job_id in job_ids}
    # Track jobs killed due to timeout
    jobs_killed = set()
    # Track jobs that have completed (inactive status, or gone from bjobs after being seen)
    jobs_completed = set()
    # Store logs captured before killing
    captured_timeout_logs: dict[str, dict] = {}
    # Store runtime info captured when timeout detected (before job is killed/purged)
    captured_runtime_info: dict[str, JobRuntime] = {}

    # Give jobs time to appear in LSF system
    time.sleep(5)
    total_elapsed = 0  # Approximate; only counts the WAIT_INTERVAL sleeps, used for logging

    while True:
        # Only query jobs that haven't completed yet
        jobs_to_query = [jid for jid in job_ids if jid not in jobs_completed and jid not in jobs_killed]

        # Get runtime information for jobs still potentially running
        # (returns {} without calling bjobs when jobs_to_query is empty)
        runtime_map = get_job_runtimes(jobs_to_query)
        still_running = []
        timeout_jobs = []

        # Check which jobs are still active and which have timed out
        for job_id in jobs_to_query:
            runtime_info = runtime_map.get(job_id)

            if runtime_info:
                # Mark that we've seen this job
                jobs_seen[job_id] = True

                # Check if job is still active
                if runtime_info.status.is_active():
                    still_running.append(job_id)

                    # Check if this job has exceeded timeout
                    # (the jobs_killed test is redundant here: killed jobs are
                    # already excluded from jobs_to_query)
                    if runtime_info.is_timeout and job_id not in jobs_killed:
                        timeout_jobs.append(job_id)
                        # CRITICAL: Save runtime info NOW before killing (job will be purged from bjobs)
                        captured_runtime_info[job_id] = runtime_info
                        logger.warning("Job %s exceeded %d seconds (runtime: %s) - will kill",
                                       job_id, MAX_WAIT_SECONDS, runtime_info.run_time_raw)
                else:
                    # Job is no longer active (e.g. DONE/EXIT) - mark as completed
                    jobs_completed.add(job_id)
                    logger.debug("Job %s completed with status %s", job_id, runtime_info.status.name)
            else:
                # Job not found in bjobs output
                if jobs_seen.get(job_id, False):
                    # We've seen it before, so it must have completed and been purged
                    jobs_completed.add(job_id)
                    logger.debug("Job %s not found in bjobs (completed and purged)", job_id)
                # else: Job hasn't appeared yet, keep waiting

        # Handle timeout jobs: capture logs BEFORE killing
        if timeout_jobs:
            logger.info("Capturing logs from %d job(s) before termination...", len(timeout_jobs))
            timeout_job_infos = [j for j in submitted_jobs if j.job_id in timeout_jobs]
            timeout_logs = get_incomplete_job_logs(timeout_job_infos)

            # Store captured logs for later retrieval
            # (get_incomplete_job_logs returns one entry per input job, in order)
            for job_info, log in zip(timeout_job_infos, timeout_logs):
                captured_timeout_logs[job_info.job_id] = log

            # Now kill the jobs (bkill's exit status is not checked)
            logger.info("Killing %d job(s) that exceeded timeout", len(timeout_jobs))
            subprocess.run(["bkill"] + timeout_jobs, check=False)
            jobs_killed.update(timeout_jobs)

        # Check if all jobs have been seen at least once
        all_jobs_seen = all(jobs_seen.values())

        # Remove killed jobs from still_running list
        still_running = [jid for jid in still_running if jid not in jobs_killed]

        if not still_running and all_jobs_seen:
            if jobs_killed:
                logger.warning("All jobs finished, but %d were killed due to timeout", len(jobs_killed))
                incomplete_table, incomplete_jobs, captured_logs = handle_killed_jobs(
                    jobs_killed, submitted_jobs, captured_runtime_info, captured_timeout_logs
                )
                return False, incomplete_table, incomplete_jobs, captured_logs
            else:
                # "successfully" here only means nothing was killed; this branch
                # does not look at whether jobs ended as DONE or EXIT
                logger.info("All jobs finished successfully!")
                return True, "", [], []

        if not still_running and not all_jobs_seen:
            # Jobs haven't appeared yet, keep waiting
            unseen_jobs = [jid for jid, seen in jobs_seen.items() if not seen]
            logger.info("Waiting for jobs to appear in LSF: %s (total elapsed: %ds)",
                        ", ".join(unseen_jobs), total_elapsed)
        else:
            # Log status with per-job runtime info
            running_info = []
            for jid in still_running[:5]:  # Show first 5 jobs
                runtime_info = runtime_map.get(jid)
                if runtime_info:
                    running_info.append(f"{jid}({runtime_info.run_time_seconds}s)")
                else:
                    running_info.append(jid)

            more = f" +{len(still_running)-5} more" if len(still_running) > 5 else ""
            logger.info("Still running: %d job(s) [%s%s] (total elapsed: %ds)",
                        len(still_running), ", ".join(running_info), more, total_elapsed)

        time.sleep(WAIT_INTERVAL)
        total_elapsed += WAIT_INTERVAL


def handle_killed_jobs(
    killed_job_ids: set[str],
    submitted_jobs: list[JobInfo],
    captured_runtime_info: dict[str, JobRuntime],
    captured_logs_map: dict[str, dict]
) -> tuple[str, list[JobInfo], list[dict]]:
    """
    Summarize jobs that were killed for exceeding the runtime limit

    Builds a github-style table (one row per killed job) and collects the log
    excerpts that were saved before each job was killed.

    Args:
        killed_job_ids: Set of job IDs that were killed due to timeout
        submitted_jobs: All submitted jobs
        captured_runtime_info: Runtime info captured when timeout was detected (before kill)
        captured_logs_map: Pre-captured logs (job_id -> log dict)

    Returns:
        (incomplete_table, list of incomplete JobInfo objects, captured logs list)
        The two lists are index-aligned; a job without a saved log gets an
        entry with empty "stderr"/"stdout".
    """
    # First occurrence wins for a duplicated job_id, hence the reversed build order
    info_by_id = {job.job_id: job for job in reversed(submitted_jobs)}

    rows = []
    incomplete_jobs = []
    captured_logs = []

    for job_id in killed_job_ids:
        job_info = info_by_id.get(job_id)

        if not job_info:
            # This should never happen - every killed ID came from submitted_jobs
            logger.error("Job %s not found in submitted_jobs - this is a bug!", job_id)
            rows.append([job_id, "unknown (BUG)", "TIMEOUT", "unknown", "unknown"])
            continue

        runtime_info = captured_runtime_info.get(job_id)
        if runtime_info:
            # Snapshot taken just before the kill
            seconds = runtime_info.run_time_seconds
            hours, remainder = divmod(seconds, 3600)
            elapsed_time = f"{seconds}s ({hours}h {remainder // 60}m)"
            status = runtime_info.status.value
        else:
            # No snapshot available for this job
            elapsed_time = "unknown (purged)"
            status = "TIMEOUT"

        rows.append([job_id, job_info.job_name, status, elapsed_time, job_info.test_id])
        incomplete_jobs.append(job_info)

        if job_id in captured_logs_map:
            captured_logs.append(captured_logs_map[job_id])
        else:
            logger.warning("No pre-captured log for job %s", job_id)
            # Keep captured_logs aligned with incomplete_jobs via an empty placeholder
            captured_logs.append({
                "job_name": job_info.job_name,
                "stderr": "",
                "stdout": ""
            })

    incomplete_table = tabulate(
        rows,
        headers=["Job ID", "Job Name", "Status", "Elapsed Time", "Test ID"],
        tablefmt="github"
    )

    logger.info("Killed jobs:")
    logger.info("\n%s", incomplete_table)
    logger.info("Using pre-captured logs for %d killed job(s)", len(captured_logs))

    return incomplete_table, incomplete_jobs, captured_logs


def check_di_pass(submitted_jobs: list[JobInfo]) -> tuple[bool, str]:
    """
    Check the LSF .out files of the submitted jobs for the pass markers

    Each .out file is scanned line by line (so 100+ MB files are never loaded
    whole) for two markers:
    - "Compilation Complete": compilation finished
    - "DI_PASS": the test passed

    A job fails this check when its .out file is missing or unreadable, when
    "Compilation Complete" never appears, or when it appears but "DI_PASS"
    does not. Scanning stops as soon as both markers have been found, and no
    per-line data is accumulated.

    Args:
        submitted_jobs: List of JobInfo objects from current run

    Returns:
        (all_passed, failure_details) where failure_details holds one line per
        failed job ("" when everything passed)
    """
    if not submitted_jobs:
        return True, ""

    failures = []

    for job_info in submitted_jobs:
        if not job_info.out_file.exists():
            failures.append(f"\n{job_info.job_name}: OUTPUT FILE NOT FOUND")
            continue

        has_compilation_pass = False
        has_di_pass = False

        try:
            with open(job_info.out_file, 'r', encoding='utf-8', errors='ignore') as f:
                for line in f:
                    # A line counts for one marker only, compilation first
                    if "Compilation Complete" in line:
                        has_compilation_pass = True
                    elif 'DI_PASS' in line:
                        has_di_pass = True

                    # Nothing later in the file can change the verdict
                    if has_compilation_pass and has_di_pass:
                        break

        except Exception as e:  # pylint: disable=W0718
            # Best effort: an unreadable log is reported as a failure, not raised
            logger.warning("Error checking DI_PASS for %s: %s", job_info.job_name, e)
            failures.append(f"\n{job_info.job_name}: ERROR READING LOG - {str(e)}")
            continue

        if not has_compilation_pass:
            failures.append(f"{job_info.test_id}: COMPILATION FAILED")
        elif not has_di_pass:
            failures.append(f"{job_info.test_id}: COMPILATION SUCCEEDED, SIMULATION FAILED")

    # Some entries carry a leading newline; strip the one at the very start
    failure_details = "\n".join(failures).lstrip()
    return not failures, failure_details


def read_last_lines(file_path: Path, num_lines: int = 20) -> str:
    """
    Read the last N lines of a file without loading a large file into memory.

    Files under 1MB are read whole in text mode. Larger files are read in
    binary mode from the end: a 100KB tail is tried first and the tail is
    doubled until it holds enough complete lines, the start of the file is
    reached, or the tail reaches 16MB (so a file with extremely long lines
    cannot force a full read).

    Args:
        file_path: Path to the file
        num_lines: Number of lines to read from end (default: 20)

    Returns:
        String containing the last N lines (fewer if the file is shorter).
        Empty string when the file does not exist, num_lines is not positive,
        or the file cannot be read (a warning is logged in that case).
    """
    # lines[-0:] would be the whole list, so non-positive counts are handled up front
    if num_lines <= 0:
        return ""

    if not file_path.exists():
        return ""

    try:
        file_size = file_path.stat().st_size

        # For small files (<1MB), just read normally
        if file_size < 1_000_000:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
                return "".join(lines[-num_lines:])

        initial_tail = 100_000   # Usually enough for a few dozen log lines
        max_tail = 16_000_000    # Upper bound on how much is ever read from the end

        # For large files, read from end
        with open(file_path, 'rb') as f:
            f.seek(0, 2)  # Seek to end
            file_size = f.tell()

            tail_size = min(initial_tail, file_size)
            while True:
                start = file_size - tail_size
                f.seek(start)
                data = f.read()

                # The first line of a tail that starts mid-file may be partial,
                # so more than num_lines line breaks are needed to be sure that
                # num_lines complete lines are present.
                enough_lines = data.count(b"\n") > num_lines
                if start == 0 or enough_lines or tail_size >= max_tail:
                    break

                tail_size = min(tail_size * 2, file_size, max_tail)

        # Decode and split into lines
        text = data.decode('utf-8', errors='ignore')
        lines = text.splitlines(keepends=True)

        # Drop the (possibly partial) first line when the read started mid-file,
        # unless it is the only thing we have.
        if start > 0 and len(lines) > 1:
            lines = lines[1:]

        return "".join(lines[-num_lines:])

    except Exception as e:  # pylint: disable=W0718
        logger.warning("Error reading %s: %s", file_path, e)
        return ""


def check_job_failures(submitted_jobs: list[JobInfo]) -> tuple[bool, str]:
    """
    Decide which jobs failed by looking at the tail of their LSF .out files

    The completion message is searched for in the last 50 lines of each .out
    file: "Successfully completed" counts as success, "Exited with exit code"
    as failure. A missing file, a missing completion message, or a read error
    is also a failure. The bjobs status is queried only to annotate the report.

    Args:
        submitted_jobs: List of JobInfo objects

    Returns:
        (all_passed, failed_jobs_table); the table is "" when nothing failed
    """
    logger.info("Checking job exit status from LSF log files...")

    if not submitted_jobs:
        logger.warning("No jobs submitted")
        return True, ""

    # Current LSF status of every job, used for the "LSF Status" column
    status_map = get_job_status([job.job_id for job in submitted_jobs])

    failed_jobs: list[FailedJob] = []
    successful_jobs = 0

    def _record_failure(job: JobInfo, code: str, lsf_status: LSFJobStatus) -> None:
        """Append a FailedJob entry for `job` with the given exit-code text."""
        failed_jobs.append(FailedJob(
            job_id=job.job_id,
            job_name=job.job_name,
            exit_code=code,
            log_file=str(job.out_file),
            status=lsf_status
        ))

    for job_info in submitted_jobs:
        job_status = status_map.get(job_info.job_id, LSFJobStatus.UNKWN)

        if not job_info.out_file.exists():
            logger.warning("Output file not found: %s", job_info.out_file)
            _record_failure(job_info, "no_output_file", job_status)
            continue

        try:
            # The LSF completion message sits at the end of the .out file
            tail_content = read_last_lines(job_info.out_file, num_lines=50)

            if "Successfully completed" in tail_content:
                successful_jobs += 1
                logger.debug("Job succeeded: %s", job_info.job_name)
            elif "Exited with exit code" in tail_content:
                code_match = re.search(r"Exited with exit code (\d+)", tail_content)
                exit_code = "unknown" if code_match is None else code_match.group(1)
                _record_failure(job_info, exit_code, job_status)
                logger.error("Job FAILED: %s (exit code: %s, status: %s)",
                             job_info.job_name, exit_code, job_status.value)
            else:
                # Neither message present - job may not have finished properly
                logger.warning("No completion status found for: %s (LSF status: %s)",
                               job_info.job_name, job_status.value)
                _record_failure(job_info, "unknown", job_status)

        except Exception as e:  # pylint: disable=W0718
            logger.error("Error reading %s: %s", job_info.out_file, e)
            _record_failure(job_info, "read_error", job_status)

    if not failed_jobs:
        logger.info("All %d jobs completed successfully", successful_jobs)
        return True, ""

    # One row per failed job, including its LSF status
    rows = [
        [failed.job_id, failed.job_name, failed.exit_code, failed.status.value]
        for failed in failed_jobs
    ]
    failed_table = tabulate(
        rows,
        headers=["Job ID", "Job Name", "Exit Code", "LSF Status"],
        tablefmt="github"
    )
    logger.error("Found %d failed job(s) out of %d total", len(failed_jobs), len(submitted_jobs))
    return False, failed_table


def get_incomplete_job_logs(incomplete_jobs: list[JobInfo]) -> list[dict]:
    """
    Collect the tail of the .err and .out files for each given job

    Args:
        incomplete_jobs: List of JobInfo objects for jobs that didn't complete

    Returns:
        List of dicts, one per job and in the same order, with the keys
        "job_name", "stderr" and "stdout" (last 20 lines of each file)
    """
    # read_last_lines avoids loading large files whole; stderr is read before stdout
    return [
        {
            "job_name": job.job_name,
            "stderr": read_last_lines(job.err_file, num_lines=20),
            "stdout": read_last_lines(job.out_file, num_lines=20),
        }
        for job in incomplete_jobs
    ]


def generate_report(output_dir: Path, job_prefix: str, exit_code: int, incomplete_table: str,
                    submitted_jobs: list[JobInfo], incomplete_logs: list[dict]) -> tuple[str, str]:
    """
    Render the run summary as markdown and as HTML

    The markdown comes from `report_template.md` and is converted to HTML,
    which is then embedded into `report_template.html`; both templates are
    looked up next to this file. When more than MAX_JOBS_FOR_DETAILED_LOGS
    log entries are supplied, the per-job logs are replaced by a short note
    pointing at the log directory so the report stays small.

    Args:
        output_dir: Directory for output files
        job_prefix: Prefix for job names
        exit_code: Exit code of the run (0 is reported as success)
        incomplete_table: Table of incomplete jobs
        submitted_jobs: All submitted jobs (scanned with check_di_pass)
        incomplete_logs: Pre-captured logs from incomplete jobs

    Returns:
        Tuple of (markdown_content, html)
    """
    run_ok = exit_code == 0
    status = "✅ SUCCESS" if run_ok else "❌ FAILED"
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # DI_PASS failures across every submitted job go into the report
    _, di_pass_failures = check_di_pass(submitted_jobs)

    # One decision drives both the log payload and the "omitted" note
    num_failures = len(incomplete_logs)
    if num_failures > MAX_JOBS_FOR_DETAILED_LOGS:
        logger.info(
            "Omitting detailed logs from email report (%d failures > %d threshold)",
            num_failures, MAX_JOBS_FOR_DETAILED_LOGS
        )
        logs_for_report = []
        omitted_note = (
            f"**Note:** Detailed logs omitted due to large number of failures ({num_failures} jobs).\n\n"
            f"Full logs are available in the directory: `{output_dir}`"
        )
    else:
        logs_for_report = incomplete_logs
        omitted_note = ""

    context = {
        "timestamp": timestamp,
        "status": status,
        "job_prefix": job_prefix,
        "log_location": f"{str(output_dir)}/{job_prefix}_*",
        "di_fail_errors": di_pass_failures,
        "incomplete_jobs_table": incomplete_table,
        "incomplete_job_logs": logs_for_report,
        "logs_omitted_message": omitted_note,
        "num_failures": num_failures,
    }

    template_dir = Path(__file__).parent

    # Markdown: load the template and fill it in
    md_source = (template_dir / "report_template.md").read_text(encoding="utf-8")
    markdown_content = Template(md_source).render(**context)

    # Markdown -> HTML fragment
    html_body = markdown.markdown(
        markdown_content,
        extensions=['tables', 'fenced_code', 'nl2br']
    )

    # HTML: wrap the fragment; the page gets the status word without the emoji
    html_source = (template_dir / "report_template.html").read_text(encoding="utf-8")
    plain_status = "SUCCESS" if run_ok else "FAILED"
    html = Template(html_source).render(html_body=html_body, status=plain_status)

    return markdown_content, html


def main(
    job_prefix: str = typer.Option(
        f"cron_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{os.getpid()}_{os.urandom(2).hex()}",
        "--job-prefix",
        help="Custom job prefix. If not provided, auto-generated from timestamp and PID"
    ),
    output_dir: str = typer.Option(
        str(LSF_LOG_DIR),
        "--output-dir",
        help="Directory for LSF log files. Defaults to buildtest/lsf_logs"
    ),
    test_filter: str = typer.Option(
        "test_matpool[0]",
        "-k",
        help="Filter for specific tests to run (e.g., 'test_matpool[0]')"
    ),
    hwtest: bool = typer.Option(
        False,
        "--hwtest",
        help="Run tests on hardware instead of simulation"
    ),
    target: BuildTarget = typer.Option(
        BuildTarget.SIM,
        "--target",
        help="Build target."
    ),
    output_root: str = typer.Option(
        None,
        "--output-root",
        help="Output root directory for pytest --output-root (supports {{worker_id}} template)"
    ),
    model_compilation: bool = typer.Option(
        False,
        "--model-compilation",
        help="Run model compilation tests instead of buildtest tests"
    ),
    with_model_data: bool = typer.Option(
        False,
        "--with-model-data",
        help="Include model data (DataGen) in compilation (requires --model-compilation)"
    ),
    model_filter: str = typer.Option(
        None,
        "--model-filter",
        help="Filter models by name pattern (glob-style, e.g., 'psd*' or 'Intel_bert')"
    )
):
    """
    AIE4 Cron Job - Automated Testing Script

    Submits all tests to LSF, waits for completion, and generates report.

    For model compilation tests, use --model-compilation flag:
        python run_lsf_tests.py --model-compilation

    For model compilation with model data:
        python run_lsf_tests.py --model-compilation --with-model-data
    """
    # NOTE: the docstring above is shown verbatim as the CLI help text, so
    # implementation notes live in comments instead.
    #
    # Flow: submit jobs -> wait for them -> inspect LSF output -> check DI_PASS
    # markers; the report is always written from the `finally` clause.
    # Returns 0 on success, 1 on any failure, 130 when interrupted by Ctrl+C.
    exit_code = 0

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Defaults so the report in `finally` can be rendered no matter where the
    # `try` body stops.
    incomplete_table = ""
    failed_jobs_table = ""
    submitted_jobs = []
    incomplete_jobs = []
    captured_logs = []  # Logs captured before killing jobs

    try:
        # Submit tests using pytest_lsf functions
        logger.info("Submitting tests with job prefix: %s", job_prefix)
        logger.info("Output directory: %s", output_dir)
        logger.info("Mode: %s", "model-compilation" if model_compilation else "buildtest")

        if model_compilation:
            # Submit model compilation tests (no target needed)
            mode_str = "with-model-data" if with_model_data else "standard"
            logger.info("Running model compilation tests (mode: %s)...", mode_str)
            if model_filter:
                logger.info("Model filter: %s", model_filter)
            submit_exit_code, submitted_jobs = submit_model_compilation_tests(
                job_prefix, output_dir, with_model_data=with_model_data, model_filter=model_filter
            )
        else:
            # The flag is only consumed by the model-compilation branch; say so
            # instead of silently ignoring it.
            if with_model_data:
                logger.warning("--with-model-data has no effect without --model-compilation")
            # Submit buildtest tests (original behavior)
            logger.info("Target: %s", target.value)
            logger.info("Hardware testing: %s", hwtest)
            if output_root:
                logger.info("Output root: %s", output_root)
            submit_exit_code, submitted_jobs = submit_all_tests(
                job_prefix, output_dir, test_filter, target, output_root, hwtest
            )

        # A non-zero submit code is only reported; waiting still proceeds for
        # whatever jobs were submitted.
        if submit_exit_code != 0:
            logger.warning("pytest_lsf.py exited with code %d", submit_exit_code)

        # Wait for completion
        success, incomplete_table, incomplete_jobs, captured_logs = wait_for_jobs(submitted_jobs)

        if not success:
            exit_code = 1
            # captured_logs already has logs from timeout (captured before killing)
        else:
            # Check for job failures by examining LSF output files
            all_passed, failed_jobs_table = check_job_failures(submitted_jobs)
            if not all_passed:
                logger.error("Some jobs failed")
                exit_code = 1
                incomplete_table = failed_jobs_table  # Use failed jobs table for report
                # Get failed jobs for log extraction
                incomplete_jobs = [
                    job for job in submitted_jobs
                    if not (job.out_file.exists() and
                            "Successfully completed" in read_last_lines(job.out_file, num_lines=50))
                ]
                # Capture logs from failed jobs (not killed, so reading now is safe)
                captured_logs = get_incomplete_job_logs(incomplete_jobs)
            else:
                # Check for DI_PASS markers (missing means test failed)
                all_di_passed, di_pass_failures = check_di_pass(submitted_jobs)
                if not all_di_passed:
                    logger.error("Some tests missing DI_PASS marker:")
                    logger.error("\n%s", di_pass_failures)
                    exit_code = 1
                else:
                    logger.info("SUCCESS: All tests have DI_PASS marker")

    except KeyboardInterrupt:
        logger.warning("Interrupted by user (Ctrl+C)")
        exit_code = 130  # Standard exit code for SIGINT (128 + 2)
        # Kill any running jobs
        if submitted_jobs:
            job_ids = [job.job_id for job in submitted_jobs]
            logger.info("Killing %d submitted job(s)...", len(job_ids))
            try:
                subprocess.run(["bkill"] + job_ids, check=False)
            except OSError as kill_error:
                # e.g. bkill not on PATH: keep going so the report is still
                # written and 130 is returned.
                logger.error("Failed to run bkill: %s", kill_error)

    except (OSError, subprocess.SubprocessError, RuntimeError) as e:
        logger.error("Error during execution: %s", e)
        traceback.print_exc()
        exit_code = 1

    except Exception:
        # Any other error still propagates to the caller, but the run must be
        # marked as failed first; otherwise the report written in `finally`
        # would claim SUCCESS for a run that crashed.
        exit_code = 1
        raise

    finally:
        # Generate and send report
        logger.info("Generating report...")
        markdown_content, html = generate_report(
            output_dir, job_prefix, exit_code, incomplete_table, submitted_jobs, captured_logs
        )

        # Save reports to files
        report_file = output_dir / f"{job_prefix}_report.html"
        markdown_file = output_dir / f"{job_prefix}_report.md"
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(html)
        with open(markdown_file, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        logger.info("HTML report saved to: %s", report_file)
        logger.info("Markdown report saved to: %s", markdown_file)

    return exit_code


if __name__ == "__main__":
    import functools

    # In standalone mode the CLI runner discards the command's return value
    # and exits with status 0, so a failed run (main() returning 1 or 130)
    # would look successful to cron. Turn the return value into the process
    # exit status instead. functools.wraps keeps main's signature, annotations
    # and docstring visible so the CLI options and help text are unchanged.
    @functools.wraps(main)
    def _main_with_exit_status(*args, **kwargs):
        raise typer.Exit(code=main(*args, **kwargs))

    typer.run(_main_with_exit_status)
