Skip to content

dbx_patch.pth_processor

[docs] module dbx_patch.pth_processor

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
"""PTH File Processor for Editable Installs.

This module processes .pth files in site-packages directories to ensure
editable install paths are added to sys.path. This is critical because
Databricks runtime bypasses standard Python site.py initialization.

Supports:
- Legacy setuptools .egg-link files
- PEP 660 __editable_*.pth files
- Standard .pth files with directory paths
"""

import contextlib
import json
import os
from pathlib import Path
import sys
from typing import Any

from dbx_patch.models import PthProcessingResult

# Module-level cached logger.
# Starts as None and is filled in by _get_logger() the first time a logger can be obtained.
_logger: Any = None


def _get_logger() -> Any:
    """Return the shared logger, looking it up on first use.

    The lookup is best-effort: any exception raised while importing or calling
    ``get_logger`` is suppressed. In that case ``None`` is returned and the
    lookup is attempted again on the next call.

    Returns:
        The cached logger instance, or ``None`` if none could be obtained.
    """
    global _logger
    if _logger is not None:
        # Already resolved on an earlier call.
        return _logger

    with contextlib.suppress(Exception):
        from dbx_patch.utils.logger import get_logger

        _logger = get_logger()
    return _logger


def get_site_packages_dirs() -> list[str]:
    """Collect the site-packages / dist-packages directories listed in sys.path.

    Only string entries whose text contains ``site-packages`` or
    ``dist-packages`` and that point at an existing directory are kept.

    Returns:
        Resolved absolute directory paths, de-duplicated, in the order they
        first appear in sys.path.
    """
    markers = ("site-packages", "dist-packages")
    # A dict keeps insertion order, so it doubles as an ordered set here.
    resolved: dict[str, None] = {}

    for entry in sys.path:
        if not isinstance(entry, str):
            continue
        if not any(marker in entry for marker in markers):
            continue

        candidate = Path(entry)
        if candidate.exists() and candidate.is_dir():
            resolved.setdefault(str(candidate.resolve()), None)

    return list(resolved)


def find_pth_files(site_packages_dir: str) -> list[str]:
    """Find all .pth files in a site-packages directory.

    Entries are returned sorted by file name. Directory iteration order is
    filesystem-dependent, and the order in which .pth files are processed
    determines the order of the resulting sys.path entries; sorting makes that
    order deterministic and mirrors what Python's own ``site`` module does.

    Args:
        site_packages_dir: Path to site-packages directory

    Returns:
        List of paths to regular files ending in ``.pth``, sorted by name.
        Empty if the directory cannot be scanned (a warning is logged).
    """
    pth_files: list[str] = []
    try:
        entries = sorted(Path(site_packages_dir).iterdir(), key=lambda entry: entry.name)
        for entry in entries:
            if entry.name.endswith(".pth") and entry.is_file():
                pth_files.append(str(entry))
    except OSError as e:  # PermissionError is a subclass of OSError
        logger = _get_logger()
        if logger:
            logger.warning(f"Could not scan {site_packages_dir}: {e}")
    return pth_files


def process_pth_file(pth_file_path: str) -> list[str]:
    """Process a single .pth file and extract directory paths.

    PTH files can contain:
    - Directory paths (one per line), absolute or relative to the .pth file's
      own directory
    - import statements (executed to install finders for PEP 660 editable installs)
    - Comments (lines starting with #)

    For modern PEP 660 editable installs, the .pth file contains import statements
    that register import hooks. We execute these statements to properly install
    the editable package. A failing import line is logged and skipped; it does
    not stop processing of the remaining lines.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark does
    not end up glued to the first line (which would otherwise hide an
    ``import`` line or corrupt the first path).

    Args:
        pth_file_path: Path to the .pth file

    Returns:
        List of resolved absolute directory paths found in the file. Lines that
        do not resolve to an existing directory are skipped. If the file cannot
        be read or decoded, a warning is logged and whatever was collected so
        far is returned.
    """
    logger = _get_logger()
    if logger:
        logger.debug(f"Processing .pth file: {pth_file_path}")

    paths: list[str] = []
    pth_path = Path(pth_file_path)

    try:
        with pth_path.open(encoding="utf-8-sig") as f:
            for line in f:
                line = line.strip()

                # Skip empty lines and comments
                if not line or line.startswith("#"):
                    continue

                # Execute import statements (for PEP 660 editable installs).
                # "import\t" is accepted as well, matching site.py.
                if line.startswith(("import ", "import\t", "__import__")):
                    try:
                        # This typically installs a meta path finder for the editable package
                        if logger:
                            logger.debug(f"Executing .pth import statement: {line}")
                        # SECURITY NOTE: this runs code taken from the .pth file. That is
                        # the standard behavior of Python's site.py for .pth files, so the
                        # file must come from a trusted site-packages directory.
                        exec(line, {"__file__": pth_file_path, "__name__": "__sitecustomize__"})  # noqa: S102
                        if logger:
                            logger.debug(f"Successfully executed import from {pth_path.name}")
                    except Exception as e:
                        if logger:
                            logger.warning(f"Failed to execute .pth import statement: {e}")
                    continue

                # Anything else is treated as a directory path; relative paths are
                # anchored at the directory containing the .pth file.
                line_path = Path(line)
                if line_path.is_absolute():
                    abs_path = line_path.resolve()
                else:
                    abs_path = (pth_path.parent / line).resolve()

                exists = abs_path.exists()
                if exists and abs_path.is_dir():
                    paths.append(str(abs_path))
                    if logger:
                        logger.debug(f"Found editable path in .pth: {abs_path}")
                elif logger:
                    logger.debug(
                        f"Skipping invalid path in .pth: {line} (resolved to {abs_path}, exists={exists}, is_dir={abs_path.is_dir() if exists else 'N/A'})"
                    )
    except (OSError, UnicodeDecodeError) as e:
        if logger:
            logger.warning(f"Could not process {pth_file_path}: {e}")

    if logger:
        logger.debug(f"Total paths from {pth_file_path}: {len(paths)}")

    return paths


def find_egg_link_paths(site_packages_dir: str) -> list[str]:
    """Find paths from .egg-link files (legacy setuptools editable installs).

    Only the first line of each ``.egg-link`` file is read; it is kept when it
    names an existing directory. The scan is best-effort: a file that cannot be
    read or decoded is skipped without affecting the others, and an unreadable
    directory yields an empty list.

    Args:
        site_packages_dir: Path to site-packages directory

    Returns:
        List of resolved absolute paths from .egg-link files, ordered by
        .egg-link file name so the result is deterministic.
    """
    paths: list[str] = []

    try:
        entries = sorted(Path(site_packages_dir).iterdir(), key=lambda entry: entry.name)
    except OSError:
        # Directory missing or unreadable: nothing to report.
        return paths

    for entry in entries:
        if not entry.name.endswith(".egg-link"):
            continue
        try:
            with entry.open() as f:
                target = f.readline().strip()
            if not target:
                continue
            target_path = Path(target)
            if target_path.exists() and target_path.is_dir():
                paths.append(str(target_path.resolve()))
        except (OSError, ValueError):
            # ValueError covers UnicodeDecodeError from an undecodable file;
            # one bad .egg-link must not abort the whole scan.
            continue

    return paths


def detect_editable_installs_via_metadata() -> set[str]:
    """Detect editable installs via importlib.metadata (PEP 660 modern approach).

    Each installed distribution's ``direct_url.json`` is inspected. When it
    marks the install as editable (``dir_info.editable``) and its ``url`` is a
    ``file:`` URL, the URL is converted to a local filesystem path. The
    conversion percent-decodes the URL (so directories containing spaces or
    other escaped characters are found) and handles Windows drive-letter URLs
    such as ``file:///C:/...``.

    Distributions whose metadata is missing, unreadable or malformed are
    skipped.

    Returns:
        Set of resolved absolute paths to editable install locations that exist.
    """
    editable_paths: set[str] = set()

    try:
        from importlib.metadata import distributions
    except ImportError:
        return editable_paths

    from urllib.parse import urlsplit
    from urllib.request import url2pathname

    for dist in distributions():
        try:
            # direct_url.json is written by modern pip for direct/editable installs
            direct_url_text = dist.read_text("direct_url.json")
            if not direct_url_text:
                continue

            direct_url = json.loads(direct_url_text)
            if not direct_url.get("dir_info", {}).get("editable"):
                continue

            parts = urlsplit(direct_url.get("url", ""))
            if parts.scheme != "file":
                continue

            location = parts.path
            if parts.netloc and parts.netloc != "localhost":
                # Keep a non-local authority (e.g. a UNC host) as part of the path.
                location = f"//{parts.netloc}{location}"

            path_obj = Path(url2pathname(location))
            if path_obj.exists():
                editable_paths.add(str(path_obj.resolve()))
        except (OSError, ValueError, AttributeError, TypeError):
            # OSError: unreadable metadata / unsupported URL host;
            # ValueError: invalid JSON or undecodable text;
            # AttributeError/TypeError: JSON with an unexpected structure.
            continue

    return editable_paths


def add_paths_to_sys_path(paths: list[str], prepend: bool = False) -> int:
    """Add paths to sys.path if not already present.

    A path is skipped when it is already in sys.path, including when it was
    added earlier in this same call, so duplicates inside ``paths`` are only
    added (and counted) once.

    Args:
        paths: List of absolute directory paths
        prepend: If True, add to beginning of sys.path; otherwise append.
            When prepending, each path is inserted at index 0 in turn, so later
            entries of ``paths`` end up ahead of earlier ones.

    Returns:
        Number of paths actually added
    """
    logger = _get_logger()
    if logger:
        logger.debug(f"Adding {len(paths)} path(s) to sys.path (prepend={prepend})")

    added_count = 0
    # Snapshot for O(1) membership tests; kept in sync as entries are added.
    existing_paths = set(sys.path)

    for path in paths:
        if path in existing_paths:
            if logger:
                logger.debug(f"Already in sys.path: {path}")
            continue

        if prepend:
            sys.path.insert(0, path)
        else:
            sys.path.append(path)
        existing_paths.add(path)
        added_count += 1
        if logger:
            logger.debug(f"Added to sys.path: {path}")

    return added_count


def process_all_pth_files(force: bool = False) -> PthProcessingResult:
    """Discover editable install paths and register them on sys.path.

    This is the main entry point for fixing editable install imports. For every
    site-packages directory it processes the .pth files and the .egg-link
    files, then adds the editable installs reported by importlib.metadata.
    The combined, de-duplicated list is appended to sys.path.

    Args:
        force: If True, every occurrence of a discovered path is first removed
            from sys.path so that the path is re-added at the end.

    Returns:
        PthProcessingResult with processing details
    """
    site_dirs = get_site_packages_dirs()
    logger = _get_logger()

    if logger:
        logger.info(f"Scanning {len(site_dirs)} site-packages directories for editable installs...")

    collected: list[str] = []
    egg_link_paths: list[str] = []
    pth_files_count = 0

    for site_dir in site_dirs:
        # .pth files first ...
        pth_files = find_pth_files(site_dir)
        pth_files_count += len(pth_files)

        for pth_file in pth_files:
            extracted = process_pth_file(pth_file)
            collected += extracted
            if extracted and logger:
                with logger.indent():
                    logger.info(f"Found {len(extracted)} path(s) in {Path(pth_file).name}")

        # ... then legacy .egg-link files from the same directory
        from_egg_links = find_egg_link_paths(site_dir)
        egg_link_paths += from_egg_links
        collected += from_egg_links

    # Editable installs reported by importlib.metadata
    metadata_paths = list(detect_editable_installs_via_metadata())
    collected += metadata_paths

    # Ordered de-duplication
    unique_paths = list(dict.fromkeys(collected))

    if force:
        # Drop every existing occurrence so the paths are added afresh below
        for path in unique_paths:
            for _ in range(sys.path.count(path)):
                sys.path.remove(path)

    paths_added = add_paths_to_sys_path(unique_paths, prepend=False)

    if logger:
        summary_lines = [
            f"- {pth_files_count} .pth files scanned",
            f"- {len(egg_link_paths)} .egg-link files found",
            f"- {len(metadata_paths)} editable installs via metadata",
            f"- {len(unique_paths)} total unique editable paths",
            f"- {paths_added} paths added to sys.path",
        ]
        logger.blank()
        logger.info("Results:")
        with logger.indent():
            for summary_line in summary_lines:
                logger.info(summary_line)

        if unique_paths:
            logger.blank()
            logger.info("Editable install paths:")
            with logger.indent():
                for path in unique_paths:
                    logger.info(f"- {path}")

    return PthProcessingResult(
        site_dirs_scanned=len(site_dirs),
        pth_files_found=pth_files_count,
        paths_extracted=unique_paths,
        egg_link_paths=egg_link_paths,
        metadata_paths=metadata_paths,
        paths_added=paths_added,
        total_editable_paths=len(unique_paths),
    )


def get_editable_install_paths() -> set[str]:
    """Collect all editable install paths without adding them to sys.path.

    Sources are the same as for process_all_pth_files: .pth files and
    .egg-link files in each site-packages directory, plus importlib.metadata.
    This function itself never writes to sys.path; note that each .pth file is
    still run through process_pth_file, which executes any import lines it
    contains.

    Returns:
        Set of absolute paths to editable install directories
    """
    logger = _get_logger()
    if logger:
        logger.debug("get_editable_install_paths() called")

    discovered: set[str] = set()

    for site_dir in get_site_packages_dirs():
        # .pth files of this directory, then its .egg-link files
        for pth_file in find_pth_files(site_dir):
            discovered |= set(process_pth_file(pth_file))
        discovered |= set(find_egg_link_paths(site_dir))

    # Installs reported by importlib.metadata
    discovered |= set(detect_editable_installs_via_metadata())

    if logger:
        logger.debug(f"get_editable_install_paths() returning {len(discovered)} path(s)")
        for found_path in sorted(discovered):
            logger.debug(f"  - {found_path}")

    return discovered