diff --git a/papermill/__init__.py b/papermill/__init__.py index 7e691ccf..6f22a34c 100644 --- a/papermill/__init__.py +++ b/papermill/__init__.py @@ -1,4 +1,5 @@ from .exceptions import PapermillException, PapermillExecutionError # noqa: F401 from .execute import execute_notebook # noqa: F401 from .inspection import inspect_notebook # noqa: F401 +from .profile import profile_notebook, build_profile, build_sections # noqa: F401 from .version import version as __version__ # noqa: F401 diff --git a/papermill/cli.py b/papermill/cli.py index 0c1b54d0..6771e568 100755 --- a/papermill/cli.py +++ b/papermill/cli.py @@ -15,6 +15,7 @@ from .execute import execute_notebook from .inspection import display_notebook_help from .iorw import NoDatesSafeLoader, read_yaml_file +from .profile import profile_notebook from .version import version as papermill_version click.disable_unicode_literals_warning = True @@ -95,6 +96,14 @@ def print_papermill_version(ctx, param, value): ) @click.option('--cwd', default=None, help='Working directory to run notebook in.') @click.option('--progress-bar/--no-progress-bar', default=None, help="Flag for turning on the progress bar.") +@click.option( + '--live-tree/--no-live-tree', + default=False, + help=( + "Show a live Rich tree of notebook sections and per-cell timing during execution, " + "replacing the tqdm progress bar. Requires: pip install 'papermill[rich]'." 
@click.command('profile', context_settings=dict(help_option_names=['-h', '--help']))
@click.argument('notebook_path')
@click.option(
    '--output',
    '-o',
    default=None,
    help='Path to write profile JSON (default: NOTEBOOK_PATH with its suffix replaced by .profile.json).',
)
def papermill_profile(notebook_path, output):
    """Profile an already-executed notebook and print a timing summary.

    NOTEBOOK_PATH must be an executed .ipynb file that contains papermill
    timing metadata (i.e. it was run via ``papermill`` or
    ``execute_notebook``).

    Writes a JSON report with per-section and per-cell durations, output
    types, bottleneck identification, and the five slowest cells.
    """
    from pathlib import Path

    # Default report location: sibling of the notebook, e.g. run.ipynb -> run.profile.json
    out_path = output or str(Path(notebook_path).with_suffix('.profile.json'))
    profile = profile_notebook(notebook_path, output=out_path)

    # 'total_duration_s' is always present in the profile but is None when no
    # timing metadata was recorded, so test the value rather than the key.
    total = profile.get('total_duration_s')
    total_text = '—' if total is None else f"{total}s"

    click.echo(f"\nNotebook : {profile['notebook']}")
    click.echo(f"Total : {total_text}")
    click.echo(f"Cells : {profile['n_code_cells']} code | Errors: {profile['n_errors']}")

    bottleneck = profile.get('bottleneck')
    if bottleneck:
        click.echo(
            f"Bottleneck: [{bottleneck['cell_index']}] in «{bottleneck['section']}» — "
            f"{bottleneck['duration_s']}s ({bottleneck['pct_of_total']}%)"
        )

    click.echo("\nSections:")
    for s in profile['sections']:
        # Indent by heading depth so sub-sections read as children.
        indent = " " * s['level']
        click.echo(f" {indent}{s['label']:<40} {s['duration_s']:.3f}s")

    if profile.get('slowest_cells'):
        click.echo("\nSlowest cells:")
        for c in profile['slowest_cells']:
            click.echo(
                f" [{c['index']}] {c['source_preview'][:50]:<52} "
                f"{c['duration_s']}s {','.join(c['output_types']) or '—'}"
            )

    click.echo(f"\nProfile written to: {out_path}")
# lazy import due to implicit slow ipython import @@ -227,10 +230,14 @@ def cell_start(self, cell, cell_index=None, **kwargs): cell.metadata.papermill["status"] = self.RUNNING cell.metadata.papermill['exception'] = False - # injects optional description of the current cell directly in the tqdm - cell_description = self.get_cell_description(cell) - if cell_description is not None and hasattr(self, 'pbar') and self.pbar: - self.pbar.set_description(f"Executing {cell_description}") + if self.live_display is not None: + if cell_index is not None: + self.live_display.on_cell_start(cell_index) + else: + # injects optional description of the current cell directly in the tqdm + cell_description = self.get_cell_description(cell) + if cell_description is not None and hasattr(self, 'pbar') and self.pbar: + self.pbar.set_description(f"Executing {cell_description}") self.save() @@ -246,6 +253,8 @@ def cell_exception(self, cell, cell_index=None, **kwargs): cell.metadata.papermill['exception'] = True cell.metadata.papermill['status'] = self.FAILED self.nb.metadata.papermill['exception'] = True + if self.live_display is not None and cell_index is not None: + self.live_display.on_cell_exception(cell_index) @catch_nb_assignment def cell_complete(self, cell, cell_index=None, **kwargs): @@ -272,7 +281,10 @@ def cell_complete(self, cell, cell_index=None, **kwargs): cell.metadata.papermill['status'] = self.COMPLETED self.save() - if self.pbar: + if self.live_display is not None: + if cell_index is not None: + self.live_display.on_cell_complete(self.nb.cells[cell_index], cell_index) + elif self.pbar: self.pbar.update(1) @catch_nb_assignment @@ -348,6 +360,7 @@ def execute_notebook( progress_bar=True, log_output=False, autosave_cell_every=30, + live_display=None, **kwargs, ): """ @@ -364,6 +377,7 @@ def execute_notebook( progress_bar=progress_bar, log_output=log_output, autosave_cell_every=autosave_cell_every, + live_display=live_display, ) nb_man.notebook_start() diff --git 
a/papermill/execute.py b/papermill/execute.py index eb76cc34..aace0b68 100644 --- a/papermill/execute.py +++ b/papermill/execute.py @@ -27,6 +27,7 @@ def execute_notebook( start_timeout=60, report_mode=False, cwd=None, + live_tree=False, **engine_kwargs, ): """Executes a single notebook locally. @@ -61,6 +62,9 @@ def execute_notebook( Flag for whether or not to hide input. cwd : str or Path, optional Working directory to use when executing the notebook + live_tree : bool, optional + Show a Rich live tree of sections and per-cell timing instead of the + default tqdm progress bar. Requires ``pip install 'papermill[rich]'``. **kwargs Arbitrary keyword arguments to pass to the notebook engine @@ -111,21 +115,46 @@ def execute_notebook( if not prepare_only: # Dropdown to the engine to fetch the kernel name from the notebook document kernel_name = papermill_engines.nb_kernel_name(engine_name=engine_name, nb=nb, name=kernel_name) + + # Resolve live_tree: if requested, disable tqdm and attach the Rich display + _live_display = None + if live_tree: + from .live_tree import LiveTreeDisplay, is_available as _rich_ok + + if _rich_ok(): + import os + + nb_name = os.path.basename(input_path) if isinstance(input_path, str) else "notebook.ipynb" + _live_display = LiveTreeDisplay(nb, nb_name) + progress_bar = False # Rich tree replaces tqdm + else: + logger.warning( + "live_tree=True requested but 'rich' is not installed. " + "Falling back to tqdm. 
Install with: pip install 'papermill[rich]'" + ) + # Execute the Notebook in `cwd` if it is set with chdir(cwd): - nb = papermill_engines.execute_notebook_with_engine( - engine_name, - nb, - input_path=input_path, - output_path=output_path if request_save_on_cell_execute else None, - kernel_name=kernel_name, - progress_bar=progress_bar, - log_output=log_output, - start_timeout=start_timeout, - stdout_file=stdout_file, - stderr_file=stderr_file, - **engine_kwargs, - ) + if _live_display is not None: + _live_display.start() + try: + nb = papermill_engines.execute_notebook_with_engine( + engine_name, + nb, + input_path=input_path, + output_path=output_path if request_save_on_cell_execute else None, + kernel_name=kernel_name, + progress_bar=progress_bar, + log_output=log_output, + start_timeout=start_timeout, + stdout_file=stdout_file, + stderr_file=stderr_file, + live_display=_live_display, + **engine_kwargs, + ) + finally: + if _live_display is not None: + _live_display.stop() # Check for errors first (it saves on error before raising) raise_for_execution_errors(nb, output_path) diff --git a/papermill/live_tree.py b/papermill/live_tree.py new file mode 100644 index 00000000..ec57ab1a --- /dev/null +++ b/papermill/live_tree.py @@ -0,0 +1,214 @@ +""" +papermill.live_tree +=================== +Optional Rich-based live tree display for notebook execution progress. + +Shows a real-time tree of notebook sections and cells with per-cell timing, +replacing the default tqdm progress bar when ``rich`` is installed and +``--live-tree`` / ``live_tree=True`` is requested. 
class LiveTreeDisplay:
    """
    Rich live tree display for papermill notebook execution.

    Build the display from the notebook before execution, call :meth:`start`,
    forward cell events to :meth:`on_cell_start`, :meth:`on_cell_complete`
    and :meth:`on_cell_exception`, and call :meth:`stop` when execution ends.

    Parameters
    ----------
    nb : nbformat.NotebookNode
        The notebook *before* execution starts (used to build the section tree).
    nb_name : str
        Display name shown at the top of the tree (typically the filename).
    refresh_per_second : int
        How often the live display refreshes. Default 4 Hz.

    Raises
    ------
    ImportError
        If the ``rich`` package is not installed.

    Examples
    --------
    Papermill calls this automatically when ``live_tree=True``; you should not
    normally need to instantiate it yourself.
    """

    def __init__(self, nb: "nbformat.NotebookNode", nb_name: str, refresh_per_second: int = 4):
        if not is_available():
            raise ImportError(
                "The 'rich' package is required for live tree display. Install it with: pip install 'papermill[rich]'"
            )
        from rich.console import Console
        from rich.live import Live

        self._sections: list[SectionProfile] = build_sections(nb)
        # index -> CellProfile lookup used by the per-cell callbacks
        self._cell_map: dict[int, CellProfile] = {cp.index: cp for sec in self._sections for cp in sec.cells}
        self._nb_name = nb_name
        self._n_total = sum(1 for c in nb.cells if c.cell_type == "code")
        self._n_done = 0
        self._lock = threading.Lock()

        self._console = Console(highlight=False)
        self._live = Live(console=self._console, refresh_per_second=refresh_per_second, transient=False)

    # ── Tree builder ──────────────────────────────────────────────────────────

    def _build_tree(self):
        """Render the current section/cell state as a ``rich.tree.Tree``."""
        from rich.markup import escape
        from rich.text import Text
        from rich.tree import Tree

        pct = 100 * self._n_done / self._n_total if self._n_total else 0
        # The header is parsed as Rich markup, so escape the file name: a name
        # such as "report[final].ipynb" must not be read as a style tag.
        header = f"[bold]{escape(self._nb_name)}[/] [dim]{self._n_done}/{self._n_total} cells {pct:.0f}%[/]"
        root = Tree(header, guide_style="dim")
        # Heading depth -> most recent tree node at that depth. Key 0 is always
        # the root and is never replaced.
        level_nodes: dict = {0: root}

        for sec in self._sections:
            # Forget nodes at this depth or deeper: they cannot be ancestors.
            for lvl in [k for k in level_nodes if k > 0 and k >= sec.level]:
                del level_nodes[lvl]
            # Attach to the nearest shallower ancestor, so "#" followed by
            # "###" nests under the "#" node rather than jumping to the root.
            parent_node = level_nodes[max(level_nodes)]

            code_cells = [c for c in sec.cells if c.cell_type == "code"]
            sec_status = sec.status
            is_running = sec_status == "running"
            live_start = None
            if is_running:
                # Use the start time of the cell that is currently running.
                live_start = next(
                    (c._live_start for c in code_cells if getattr(c, "_live_start", None) and c.status == "running"),
                    None,
                )

            glyph_ch, glyph_style = _GLYPH.get(sec_status, ("?", "dim"))
            glyph = Text(glyph_ch, style=glyph_style)
            dur_t = self._dur_text(sec.duration_s if sec.duration_s > 0 else None, is_running, live_start)

            sec_label = Text()
            sec_label.append_text(glyph)
            sec_label.append(f" {sec.display_label:<38}", style="bold" if is_running else "")
            sec_label.append(" ")
            sec_label.append_text(dur_t)

            sec_node = parent_node.add(sec_label)
            # The implicit preamble has level 0; registering it would replace
            # the root and nest every later top-level section beneath it.
            if sec.level > 0:
                level_nodes[sec.level] = sec_node

            for cp in code_cells:
                running = cp.status == "running"
                g_ch, g_st = _GLYPH.get(cp.status, ("?", "dim"))
                cell_label = Text()
                cell_label.append_text(Text(g_ch, style=g_st))
                cell_label.append(f" [{cp.index}] {cp.source_preview[:45]:<45}", style="yellow" if running else "dim")
                cell_label.append(" ")
                cell_label.append_text(self._dur_text(cp.duration_s, running, getattr(cp, "_live_start", None)))
                sec_node.add(cell_label)

        return root

    @staticmethod
    def _dur_text(dur, running: bool = False, live_start=None):
        """Format a duration: live elapsed time while running, else the final value."""
        from rich.text import Text

        if running and live_start is not None:
            elapsed = time.monotonic() - live_start
            return Text(f"{elapsed:.1f}s…", style="yellow")
        if dur is not None:
            # Colour by magnitude: >30s red, >5s yellow, otherwise green.
            style = "red" if dur > 30 else ("yellow" if dur > 5 else "green")
            return Text(f"{dur:.1f}s", style=style)
        return Text("", style="dim")

    def _refresh(self):
        """Rebuild the tree and push it to the live display."""
        with self._lock:
            self._live.update(self._build_tree())

    # ── Lifecycle ─────────────────────────────────────────────────────────────

    def start(self):
        """Start the live display context."""
        self._live.start()
        self._refresh()

    def stop(self):
        """Render the final state and stop the live display context."""
        try:
            self._refresh()
        finally:
            # Always stop the live display, even if the last render fails.
            self._live.stop()

    # ── Callbacks (called by NotebookExecutionManager hooks) ──────────────────

    def on_cell_start(self, cell_index: int) -> None:
        """Mark the cell at *cell_index* as running and start its live timer."""
        cp = self._cell_map.get(cell_index)
        if cp is not None:
            cp.status = "running"
            cp._live_start = time.monotonic()
        self._refresh()

    def on_cell_complete(self, cell, cell_index: int) -> None:
        """Copy status, exception flag and duration from the cell's papermill metadata."""
        cp = self._cell_map.get(cell_index)
        if cp is not None:
            pm = cell.get("metadata", {}).get("papermill", {})
            cp.status = pm.get("status", "completed")
            cp.exception = bool(pm.get("exception", False))
            cp.duration_s = pm.get("duration")
            if cell.cell_type == "code":
                with self._lock:
                    self._n_done += 1
        self._refresh()

    def on_cell_exception(self, cell_index: int) -> None:
        """Mark the cell at *cell_index* as failed."""
        cp = self._cell_map.get(cell_index)
        if cp is not None:
            cp.status = "failed"
        self._refresh()
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)", re.MULTILINE)


# ── Data classes ──────────────────────────────────────────────────────────────


@dataclass
class CellProfile:
    """Timing and output metadata for a single notebook cell."""

    index: int
    execution_count: Optional[int]
    source_preview: str
    cell_type: str
    status: str = "pending"
    exception: bool = False
    duration_s: Optional[float] = None
    output_types: list = field(default_factory=list)
    output_size_chars: int = 0
    has_plot: bool = False
    n_outputs: int = 0
    stdout_chars: int = 0
    stderr_chars: int = 0
    start_time: Optional[str] = None
    end_time: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the JSON-serialisable form (start/end times are not exported)."""
        return {
            "index": self.index,
            "execution_count": self.execution_count,
            "source_preview": self.source_preview,
            "cell_type": self.cell_type,
            "status": self.status,
            "exception": self.exception,
            "duration_s": round(self.duration_s, 4) if self.duration_s is not None else None,
            "output_types": self.output_types,
            "output_size_chars": self.output_size_chars,
            "has_plot": self.has_plot,
            "n_outputs": self.n_outputs,
            "stdout_chars": self.stdout_chars,
            "stderr_chars": self.stderr_chars,
        }


@dataclass
class SectionProfile:
    """A section of cells bounded by a markdown heading."""

    title: str  # original markdown heading text
    level: int  # heading depth (1-6); 0 for the implicit preamble
    number: str = ""  # e.g. "1", "1.2", "1.2.3"
    cells: list = field(default_factory=list)

    @property
    def display_label(self) -> str:
        """Human-readable sequential label (e.g. 'Section 1', 'Sub-section 1.2')."""
        if not self.number:
            return self.title
        depth = self.number.count(".") + 1
        return f"Section {self.number}" if depth == 1 else f"Sub-section {self.number}"

    @property
    def duration_s(self) -> float:
        """Sum of the cell durations; cells without timing count as zero."""
        return sum(c.duration_s or 0.0 for c in self.cells)

    @property
    def status(self) -> str:
        """Aggregate status of the code cells: failed > running > pending > completed."""
        statuses = {c.status for c in self.cells if c.cell_type == "code"}
        for candidate in ("failed", "running", "pending"):
            if candidate in statuses:
                return candidate
        # No code cells at all means there was nothing to run.
        return "completed" if statuses else "skipped"

    def to_dict(self) -> dict:
        """Return the JSON-serialisable form, including every cell."""
        return {
            "label": self.display_label,
            "title": self.title,
            "level": self.level,
            "number": self.number,
            "duration_s": round(self.duration_s, 4),
            "status": self.status,
            "cells": [c.to_dict() for c in self.cells],
        }


# ── Section parser ────────────────────────────────────────────────────────────


def _heading_level(source: str):
    """Return (depth, title) if *source* starts with a markdown heading, else None."""
    m = _HEADING_RE.match(source.strip())
    return (len(m.group(1)), m.group(2).strip()) if m else None


def _source_preview(source: str, max_chars: int = 60) -> str:
    """Return the first line of *source*, truncated to *max_chars* with an ellipsis."""
    first = source.strip().split("\n")[0]
    return first[:max_chars] + ("…" if len(first) > max_chars else "")


def build_sections(nb: "nbformat.NotebookNode") -> list:
    """
    Group notebook cells into :class:`SectionProfile` objects by markdown headings.

    Cells before the first heading collect in an implicit preamble section
    (level 0, empty number), which is omitted when it has no cells. Every
    heading produces a section, even when no cells follow it, so an empty
    parent heading or a trailing heading still appears in the result.

    Section numbers are assigned sequentially per heading depth so the tree
    carries no text from the notebook source code::

        #  → Section 1, Section 2, …
        ## → Sub-section 1.1, Sub-section 1.2, …

    Parameters
    ----------
    nb : nbformat.NotebookNode

    Returns
    -------
    list[SectionProfile]
    """
    sections = []
    current = SectionProfile(title="[preamble]", level=0, number="")
    counters: dict = {}  # heading depth -> running count at that depth

    def _next_number(level: int) -> str:
        # A new heading resets the counters of everything nested beneath it.
        for deeper in [k for k in counters if k > level]:
            del counters[deeper]
        counters[level] = counters.get(level, 0) + 1
        return ".".join(str(counters[lvl]) for lvl in sorted(counters))

    def _keep(section: SectionProfile) -> bool:
        # Real headings (level >= 1) are always kept; the preamble only when
        # it actually holds cells.
        return bool(section.cells) or section.level > 0

    for i, cell in enumerate(nb.cells):
        source = cell.get("source", "")
        if cell.cell_type == "markdown":
            heading = _heading_level(source)
            if heading:
                level, title = heading
                if _keep(current):
                    sections.append(current)
                current = SectionProfile(title=title, level=level, number=_next_number(level))
                # The heading cell itself delimits the section and is not listed in it.
                continue
        current.cells.append(
            CellProfile(
                index=i,
                execution_count=cell.get("execution_count"),
                source_preview=_source_preview(source),
                cell_type=cell.cell_type,
            )
        )

    # Same rule as inside the loop, so a trailing heading without cells is not dropped.
    if _keep(current):
        sections.append(current)
    return sections


# ── Output analysis ───────────────────────────────────────────────────────────


def _joined(value) -> str:
    """Return *value* as one string; multi-line notebook text may be a list of lines."""
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        return "".join(str(part) for part in value)
    return str(value)


def _analyze_outputs(cell: "nbformat.NotebookNode") -> dict:
    """Summarise a cell's outputs: types, text size, image presence and stream sizes."""
    outputs = cell.get("outputs", [])
    types, size, plot, stdout_c, stderr_c = [], 0, False, 0, 0
    for out in outputs:
        ot = out.get("output_type", "")
        types.append(ot)
        if ot == "stream":
            text = _joined(out.get("text", ""))
            size += len(text)
            if out.get("name") == "stdout":
                stdout_c += len(text)
            else:
                stderr_c += len(text)
        elif ot in ("execute_result", "display_data"):
            data = out.get("data", {})
            # Any image MIME type counts (png, svg, jpeg, gif, ...).
            if any(str(mime).startswith("image/") for mime in data):
                plot = True
            # Measure the text itself, not the repr of a list of lines.
            size += len(_joined(data.get("text/plain", "")))
        elif ot == "error":
            size += len(out.get("evalue", "")) + len("\n".join(out.get("traceback", [])))
    return {
        "output_types": types,
        "output_size_chars": size,
        "has_plot": plot,
        "n_outputs": len(outputs),
        "stdout_chars": stdout_c,
        "stderr_chars": stderr_c,
    }


def _populate_from_executed(
    nb: "nbformat.NotebookNode",
    sections: list,
    cell_map: dict,
) -> None:
    """Fill CellProfile fields from papermill metadata stored in an executed notebook."""
    for i, cell in enumerate(nb.cells):
        cp = cell_map.get(i)
        if cp is None:
            # Heading cells are section delimiters and have no CellProfile.
            continue
        pm = cell.get("metadata", {}).get("papermill", {})
        cp.status = pm.get("status", "completed")
        cp.exception = bool(pm.get("exception", False))
        cp.duration_s = pm.get("duration")
        cp.start_time = pm.get("start_time")
        cp.end_time = pm.get("end_time")
        cp.execution_count = cell.get("execution_count")
        out = _analyze_outputs(cell)
        cp.output_types = out["output_types"]
        cp.output_size_chars = out["output_size_chars"]
        cp.has_plot = out["has_plot"]
        cp.n_outputs = out["n_outputs"]
        cp.stdout_chars = out["stdout_chars"]
        cp.stderr_chars = out["stderr_chars"]


# ── Profile builder ───────────────────────────────────────────────────────────


def build_profile(notebook_path: str, nb: "nbformat.NotebookNode") -> dict:
    """
    Build a profile dict from an executed notebook node.

    Parameters
    ----------
    notebook_path : str
        Path to the notebook file (used only for the ``"notebook"`` key).
    nb : nbformat.NotebookNode
        The executed notebook (papermill metadata must be present for timing).

    Returns
    -------
    dict
        Profile report — see module docstring for the full schema.
    """
    sections = build_sections(nb)
    cell_map = {cp.index: cp for sec in sections for cp in sec.cells}
    _populate_from_executed(nb, sections, cell_map)

    pm_nb = nb.get("metadata", {}).get("papermill", {})
    all_code = [cp for sec in sections for cp in sec.cells if cp.cell_type == "code"]
    # Only cells with recorded timing can be ranked.
    sorted_dur = sorted(
        (c for c in all_code if c.duration_s is not None),
        key=lambda c: c.duration_s,
        reverse=True,
    )
    # Prefer the notebook-level duration; fall back to the sum of cell durations.
    total_dur = pm_nb.get("duration") or sum(c.duration_s or 0 for c in all_code)
    n_errors = sum(1 for c in all_code if c.exception)

    # Cell index -> section label, built once instead of re-scanning all
    # sections for every lookup.
    label_of = {cp.index: sec.display_label for sec in sections for cp in sec.cells}

    slowest = [{**c.to_dict(), "section": label_of.get(c.index, "?")} for c in sorted_dur[:5]]

    bottleneck = None
    if sorted_dur:
        b = sorted_dur[0]
        bottleneck = {
            "cell_index": b.index,
            "section": label_of.get(b.index, "?"),
            "duration_s": round(b.duration_s, 4),
            "pct_of_total": round(100 * b.duration_s / total_dur, 1) if total_dur else 0,
        }

    return {
        "notebook": str(notebook_path),
        "total_duration_s": round(total_dur, 4) if total_dur else None,
        "n_cells": len(nb.cells),
        "n_code_cells": len(all_code),
        "n_errors": n_errors,
        "notebook_start": pm_nb.get("start_time"),
        "notebook_end": pm_nb.get("end_time"),
        "sections": [s.to_dict() for s in sections],
        "slowest_cells": slowest,
        "bottleneck": bottleneck,
    }


# ── Public API ────────────────────────────────────────────────────────────────


def profile_notebook(notebook_path: str, output: Optional[str] = None) -> dict:
    """
    Profile an already-executed notebook and return the profile as a dict.

    Parameters
    ----------
    notebook_path : str
        Path to an executed ``.ipynb`` file (must contain papermill metadata).
    output : str, optional
        If given, write the profile JSON to this path.

    Returns
    -------
    dict
        Profile report — see module docstring for the full schema.

    Examples
    --------
    >>> from papermill.profile import profile_notebook
    >>> profile = profile_notebook("executed_notebook.ipynb")
    >>> print(profile["total_duration_s"])
    42.1
    >>> print(profile["bottleneck"])
    {'cell_index': 5, 'section': 'Sub-section 2.1', 'duration_s': 18.0, 'pct_of_total': 42.8}
    """
    # Notebook files are UTF-8 JSON; do not depend on the platform default
    # encoding (e.g. cp1252 on Windows), which fails on non-ASCII content.
    with open(notebook_path, encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    result = build_profile(notebook_path, nb)

    if output:
        with open(output, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)

    return result
_make_nb([("code", "x = 1", None), ("code", "y = 2", None)]) + sections = build_sections(nb) + assert len(sections) == 1 + assert sections[0].title == "[preamble]" + assert sections[0].number == "" + assert len(sections[0].cells) == 2 + + def test_single_heading(self): + nb = _make_nb( + [ + ("markdown", "# Data Loading", None), + ("code", "import pandas", None), + ] + ) + sections = build_sections(nb) + assert len(sections) == 1 + assert sections[0].number == "1" + assert sections[0].display_label == "Section 1" + + def test_sequential_numbering(self): + nb = _make_nb( + [ + ("markdown", "# First", None), + ("code", "a=1", None), + ("markdown", "# Second", None), + ("code", "b=2", None), + ("markdown", "# Third", None), + ("code", "c=3", None), + ] + ) + sections = build_sections(nb) + assert [s.number for s in sections] == ["1", "2", "3"] + assert [s.display_label for s in sections] == ["Section 1", "Section 2", "Section 3"] + + def test_nested_sub_sections(self): + nb = _make_nb( + [ + ("markdown", "# Analysis", None), + ("code", "x=1", None), + ("markdown", "## Cleaning", None), + ("code", "y=2", None), + ("markdown", "## Feature Engineering", None), + ("code", "z=3", None), + ("markdown", "# Results", None), + ("code", "w=4", None), + ] + ) + sections = build_sections(nb) + labels = [s.display_label for s in sections] + assert labels == ["Section 1", "Sub-section 1.1", "Sub-section 1.2", "Section 2"] + + def test_sub_section_counter_resets_across_top_sections(self): + nb = _make_nb( + [ + ("markdown", "# A", None), + ("code", "a=1", None), + ("markdown", "## A1", None), + ("code", "b=2", None), + ("markdown", "# B", None), + ("code", "c=3", None), + ("markdown", "## B1", None), + ("code", "d=4", None), + ] + ) + sections = build_sections(nb) + numbers = [s.number for s in sections] + # After # B the sub-counter resets, so ## B1 becomes 2.1 not 1.2 + assert numbers == ["1", "1.1", "2", "2.1"] + + def test_heading_cells_not_added_to_cell_list(self): + nb = 
_make_nb( + [ + ("markdown", "# Title", None), + ("markdown", "Some prose (no heading)", None), + ("code", "x=1", None), + ] + ) + sections = build_sections(nb) + assert len(sections) == 1 + # Only the prose markdown + code cell should be in cells + assert len(sections[0].cells) == 2 + + +# ── SectionProfile ──────────────────────────────────────────────────────────── + + +class TestSectionProfile: + def test_display_label_preamble(self): + s = SectionProfile(title="[preamble]", level=0, number="") + assert s.display_label == "[preamble]" + + def test_display_label_section(self): + s = SectionProfile(title="Data", level=1, number="3") + assert s.display_label == "Section 3" + + def test_display_label_subsection(self): + s = SectionProfile(title="Cleaning", level=2, number="1.2") + assert s.display_label == "Sub-section 1.2" + + def test_duration_sums_cells(self): + s = SectionProfile(title="T", level=1, number="1") + s.cells = [ + CellProfile(0, 1, "", "code", duration_s=1.0), + CellProfile(1, 2, "", "code", duration_s=2.5), + ] + assert s.duration_s == pytest.approx(3.5) + + def test_status_failed_takes_priority(self): + s = SectionProfile(title="T", level=1, number="1") + s.cells = [ + CellProfile(0, 1, "", "code", status="completed"), + CellProfile(1, 2, "", "code", status="failed"), + ] + assert s.status == "failed" + + def test_to_dict_contains_label_and_title(self): + s = SectionProfile(title="Data Loading", level=1, number="1") + d = s.to_dict() + assert d["label"] == "Section 1" + assert d["title"] == "Data Loading" + assert d["number"] == "1" + + +# ── build_profile ───────────────────────────────────────────────────────────── + + +class TestBuildProfile: + def _make_executed_nb(self): + nb = _make_nb( + [ + ("markdown", "# Imports", None), + ("code", "import numpy as np", _executed_cell_meta(0.1)), + ("markdown", "## Heavy computation", None), + ("code", "result = np.sum(range(1000))", _executed_cell_meta(5.0)), + ("markdown", "# Results", None), + 
("code", "print(result)", _executed_cell_meta(0.05)), + ] + ) + nb.metadata["papermill"] = { + "start_time": "2026-01-01T00:00:00+00:00", + "end_time": "2026-01-01T00:00:06+00:00", + "duration": 5.15, + "exception": False, + } + return nb + + def test_profile_keys(self): + nb = self._make_executed_nb() + profile = build_profile("test.ipynb", nb) + assert "notebook" in profile + assert "total_duration_s" in profile + assert "sections" in profile + assert "bottleneck" in profile + assert "slowest_cells" in profile + + def test_bottleneck_points_to_slowest_cell(self): + nb = self._make_executed_nb() + profile = build_profile("test.ipynb", nb) + assert profile["bottleneck"]["duration_s"] == pytest.approx(5.0) + assert "Sub-section" in profile["bottleneck"]["section"] + + def test_section_labels_in_profile(self): + nb = self._make_executed_nb() + profile = build_profile("test.ipynb", nb) + labels = [s["label"] for s in profile["sections"]] + assert "Section 1" in labels + assert "Sub-section 1.1" in labels + assert "Section 2" in labels + + def test_n_errors_zero_when_no_exceptions(self): + nb = self._make_executed_nb() + assert build_profile("test.ipynb", nb)["n_errors"] == 0 + + def test_n_errors_counts_exceptions(self): + nb = self._make_executed_nb() + nb.cells[1].metadata["papermill"]["exception"] = True + nb.cells[1].metadata["papermill"]["status"] = "failed" + assert build_profile("test.ipynb", nb)["n_errors"] == 1 + + +# ── profile_notebook ────────────────────────────────────────────────────────── + + +class TestProfileNotebook: + def test_returns_dict(self, tmp_path): + nb = nbformat.v4.new_notebook() + cell = nbformat.v4.new_code_cell("x = 1") + cell.metadata["papermill"] = _executed_cell_meta(0.1) + nb.cells = [cell] + nb.metadata["papermill"] = {"duration": 0.1, "exception": False} + nb_path = tmp_path / "test.ipynb" + nbformat.write(nb, str(nb_path)) + + profile = profile_notebook(str(nb_path)) + assert isinstance(profile, dict) + assert 
profile["n_code_cells"] == 1 + + def test_writes_json_file(self, tmp_path): + nb = nbformat.v4.new_notebook() + cell = nbformat.v4.new_code_cell("x = 1") + cell.metadata["papermill"] = _executed_cell_meta(0.2) + nb.cells = [cell] + nb.metadata["papermill"] = {"duration": 0.2, "exception": False} + nb_path = tmp_path / "test.ipynb" + out_path = tmp_path / "test.profile.json" + nbformat.write(nb, str(nb_path)) + + profile_notebook(str(nb_path), output=str(out_path)) + assert out_path.exists() + with open(out_path) as f: + data = json.load(f) + assert data["n_code_cells"] == 1 + + +# ── live_tree availability guard ────────────────────────────────────────────── + + +class TestLiveTreeAvailability: + def test_is_available_returns_bool(self): + from papermill.live_tree import is_available + + assert isinstance(is_available(), bool) diff --git a/pyproject.toml b/pyproject.toml index 99336a0a..9c714344 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,6 +108,7 @@ optional-dependencies.docs = [ optional-dependencies.gcs = [ "gcsfs>=0.2" ] optional-dependencies.github = [ "pygithub>=1.55" ] optional-dependencies.hdfs = [ "pyarrow>=2" ] +optional-dependencies.rich = [ "rich>=13" ] optional-dependencies.s3 = [ "boto3" ] optional-dependencies.test = [ "attrs>=17.4", @@ -147,6 +148,7 @@ urls.Funding = "https://nteract.io" urls.Source = "https://github.com/nteract/papermill/" urls.Tracker = "https://github.com/nteract/papermill/issues" scripts.papermill = "papermill.__main__:papermill" +scripts.papermill-profile = "papermill.cli:papermill_profile" [tool.setuptools] packages = [ "papermill" ]