<?php
// Stop immediately unless the ABSPATH constant is defined, so this file
// does nothing when it is requested directly instead of being loaded by WordPress.
if ( ! defined('ABSPATH') ) exit;

/**
 * Broken-link scanner.
 *
 * Fetches a small set of site pages, extracts the links found in their HTML,
 * and checks each link over HTTP. All methods are static; the only public
 * entry points are init() and scan_now().
 */
class DCW_BLCL_Scanner {

    /**
     * Kept to satisfy calls from the main plugin loader.
     * Add hooks here in the future if needed.
     */
    public static function init() {
        // No hooks needed for the scanner itself right now.
    }

    /**
     * Run a scan now using the saved 'dcw_blcl_settings' option.
     *
     * Recognised settings (with defaults):
     *  - scope         'home' | 'home_plus_latest'  (default 'home')
     *  - limit         max number of links to check  (default 150, minimum 1)
     *  - timeout       seconds per HTTP request      (default 8)
     *  - internal_only 1 = skip links to other hosts (default 0)
     *
     * @return array {
     *     @type int   $checked  Number of links that were requested.
     *     @type array $broken   List of arrays with 'url', 'code' and 'reason'.
     *     @type float $duration Wall-clock seconds spent scanning.
     * }
     */
    public static function scan_now() {
        $opts = get_option('dcw_blcl_settings', array());
        $opts = wp_parse_args($opts, array(
            'scope'         => 'home', // home|home_plus_latest
            'limit'         => 150,    // max links to check
            'timeout'       => 8,      // seconds per request
            'internal_only' => 0,      // 1 = only same host
        ));

        $start         = microtime(true);
        $timeout       = (int) $opts['timeout'];
        $limit         = max(1, (int) $opts['limit']); // a zero/negative limit would otherwise behave like 1 by accident
        $internal_only = ! empty($opts['internal_only']);
        // Host names are case-insensitive, so compare them lower-cased.
        $host          = strtolower( (string) wp_parse_url(home_url(), PHP_URL_HOST) );

        $pages    = self::collect_pages($opts['scope']);
        $seen     = array();
        $to_check = array();

        foreach ($pages as $page_url) {
            $html = self::fetch_body($page_url, $timeout);
            if ( $html === '' ) { continue; }

            foreach (self::extract_links($html, $page_url) as $link) {
                // Optionally keep only links that point at this site's host.
                if ( $internal_only ) {
                    $link_host = wp_parse_url($link, PHP_URL_HOST);
                    if ( $link_host && strtolower($link_host) !== $host ) {
                        continue;
                    }
                }

                // De-dup on the URL without its fragment: the fragment is never
                // sent to the server, so "page#a" and "page#b" are one request.
                $key = preg_replace('/#.*$/s', '', $link);
                if ( isset($seen[$key]) ) { continue; }
                $seen[$key] = true;

                $to_check[] = $link;
                if ( count($to_check) >= $limit ) { break 2; }
            }
        }

        $broken  = array();
        $checked = 0;

        foreach ($to_check as $url) {
            $res = self::check_url($url, $timeout);
            $checked++;
            if ( ! $res['ok'] ) {
                $broken[] = array(
                    'url'    => $url,
                    'code'   => $res['code'],
                    'reason' => $res['reason'],
                );
            }
        }

        $duration = max(0, microtime(true) - $start);

        return array(
            'checked'  => $checked,
            'broken'   => $broken,
            'duration' => $duration,
        );
    }

    /**
     * Collect a small set of page URLs to scan based on scope.
     *
     * Always includes the home URL. For scope 'home_plus_latest' the permalinks
     * of the 10 most recent published posts/pages are added as well.
     *
     * @param string $scope 'home' or 'home_plus_latest'.
     * @return array List of unique URLs (strings).
     */
    private static function collect_pages($scope) {
        $urls = array( home_url('/') );

        if ($scope === 'home_plus_latest') {
            // latest 10 posts and pages (you can tune this)
            $latest = get_posts(array(
                'post_type'      => array('post','page'),
                'post_status'    => 'publish',
                'posts_per_page' => 10,
                'orderby'        => 'date',
                'order'          => 'DESC',
                'fields'         => 'ids',
                'no_found_rows'  => true,
            ));
            foreach ($latest as $pid) {
                $u = get_permalink($pid);
                if ($u) { $urls[] = $u; }
            }
        }

        // de-dup
        return array_values(array_unique($urls));
    }

    /**
     * Fetch a page body (HTML) with a GET request.
     *
     * @param string $url     Page URL.
     * @param int    $timeout Timeout in seconds (values below 1 are raised to 1).
     * @return string Response body, or '' on transport error or a status
     *                outside 200-399.
     */
    private static function fetch_body($url, $timeout) {
        $args = array(
            'timeout'     => max(1, (int)$timeout),
            'redirection' => 3,
            'sslverify'   => true,
            'headers'     => array(
                'Accept' => 'text/html,application/xhtml+xml',
            ),
        );
        $resp = wp_remote_get($url, $args);
        if ( is_wp_error($resp) ) { return ''; }

        $code = (int) wp_remote_retrieve_response_code($resp);
        if ( $code < 200 || $code >= 400 ) { return ''; }

        return (string) wp_remote_retrieve_body($resp);
    }

    /**
     * Extract links from HTML and resolve them to absolute URLs based on $base_url.
     *
     * Looks at quoted href/src attribute values and at meta-refresh targets.
     * Values are HTML-entity decoded first (markup writes "&" as "&amp;").
     * References that to_absolute_url() cannot turn into an http(s) URL
     * (mailto:, tel:, javascript:, data:, ...) are dropped.
     *
     * @param string $html     Page markup.
     * @param string $base_url URL the markup was fetched from.
     * @return array Unique absolute URLs (strings).
     */
    private static function extract_links($html, $base_url) {
        $candidates = array();

        // Quick match on href/src
        if ( preg_match_all('#\b(?:href|src)\s*=\s*([\'"])(.*?)\1#i', $html, $m) ) {
            $candidates = $m[2];
        }

        // Also catch meta refresh (rare)
        if ( preg_match_all('#<meta[^>]+http-equiv=["\']refresh["\'][^>]*content=["\']\d+;\s*url=([^"\']+)["\']#i', $html, $m2) ) {
            $candidates = array_merge($candidates, $m2[1]);
        }

        $links = array();
        foreach ($candidates as $raw) {
            // Attribute values are HTML-escaped in the markup; decode before use
            // so "?a=1&amp;b=2" is requested as "?a=1&b=2".
            $raw = trim( html_entity_decode($raw, ENT_QUOTES | ENT_HTML5, 'UTF-8') );
            if ( $raw === '' || $raw === '#' ) {
                continue;
            }
            $abs = self::to_absolute_url($raw, $base_url);
            if ($abs) { $links[] = $abs; }
        }

        // De-dup
        return array_values(array_unique($links));
    }

    /**
     * Build an absolute http(s) URL from a possibly relative $url against $base_url.
     *
     * @param string $url      Reference as found in the markup (already entity-decoded).
     * @param string $base_url Absolute URL of the page containing the reference.
     * @return string Absolute URL, or '' when the reference uses a scheme other
     *                than http/https or the base URL is unusable.
     */
    private static function to_absolute_url($url, $base_url) {
        $url = trim( (string) $url );
        if ( $url === '' ) {
            return '';
        }

        // Reference carries its own scheme: only http(s)://... can be checked
        // over HTTP. mailto:, tel:, javascript:, data:, ftp:// etc. are ignored
        // instead of being mistaken for relative paths.
        if ( preg_match('#^([a-z][a-z0-9+\-.]*):(//)?#i', $url, $sm) ) {
            $scheme = strtolower($sm[1]);
            if ( ( $scheme !== 'http' && $scheme !== 'https' ) || empty($sm[2]) ) {
                return '';
            }
            return esc_url_raw($url);
        }

        // Protocol-relative
        if ( strpos($url, '//') === 0 ) {
            $scheme = wp_parse_url($base_url, PHP_URL_SCHEME);
            if ( ! $scheme ) { $scheme = 'https'; }
            return esc_url_raw($scheme . ':' . $url);
        }

        // Anchors only
        if ( $url[0] === '#' ) {
            return esc_url_raw( $base_url . $url ); // keep on same page
        }

        $base = wp_parse_url($base_url);
        if ( ! is_array($base) || empty($base['scheme']) || empty($base['host']) ) {
            return '';
        }

        // Split off "?query" / "#fragment" so dot-segment normalisation only
        // ever touches the path part of the reference.
        $suffix  = '';
        $path_len = strcspn($url, '?#');
        if ( $path_len < strlen($url) ) {
            $suffix = substr($url, $path_len);
            $url    = substr($url, 0, $path_len);
        }

        $base_path = ( isset($base['path']) && $base['path'] !== '' ) ? $base['path'] : '/';

        if ( $url === '' ) {
            // Query-only reference ("?page=2"): the base path is kept unchanged.
            $path = $base_path;
        } elseif ( $url[0] === '/' ) {
            // root-relative
            $path = self::normalize_path($url);
        } else {
            // relative: resolve against the base "directory"
            if ( substr($base_path, -1) !== '/' ) {
                // remove last segment if base is a file path
                $base_path = preg_replace('#/[^/]*$#', '/', $base_path);
            }
            $path = self::normalize_path($base_path . $url);
        }

        $port = isset($base['port']) ? ':' . $base['port'] : '';
        $abs  = $base['scheme'] . '://' . $base['host'] . $port . $path . $suffix;

        return esc_url_raw($abs);
    }

    /**
     * Normalize a URL path: collapse "./", "../" and empty segments.
     *
     * The result always starts with "/". A trailing slash is kept when the
     * input ends in "/", "/." or "/.." (all of which denote a directory).
     *
     * @param string $path Path component only (no query or fragment).
     * @return string Normalized path.
     */
    private static function normalize_path($path) {
        $segments = explode('/', (string) $path);
        $stack    = array();

        foreach ($segments as $segment) {
            if ( $segment === '' || $segment === '.' ) {
                continue; // empty and single-dot segments contribute nothing
            }
            if ( $segment === '..' ) {
                array_pop($stack); // popping an empty stack is a harmless no-op
                continue;
            }
            $stack[] = $segment;
        }

        $normalized = '/' . implode('/', $stack);

        // The input named a directory if its last segment was empty (trailing
        // slash), "." or ".."; preserve that with a trailing slash.
        $last         = $segments[ count($segments) - 1 ];
        $is_directory = ( $last === '' || $last === '.' || $last === '..' );
        if ( $is_directory && substr($normalized, -1) !== '/' ) {
            $normalized .= '/';
        }

        return $normalized;
    }

    /**
     * Check a URL with HEAD first, GET fallback.
     *
     * The GET fallback runs whenever HEAD does not yield a 200-399 status,
     * i.e. on transport errors AND on error statuses, because servers that
     * reject HEAD typically answer 405/501/403 rather than dropping the
     * connection. If the GET itself fails at transport level but HEAD did
     * return a status, the HEAD status is reported.
     *
     * @param string $url     Absolute URL to check.
     * @param int    $timeout Timeout in seconds (values below 1 are raised to 1).
     * @return array 'ok' (bool, true for 200-399), 'code' (int, 0 on network
     *               error) and 'reason' (string).
     */
    private static function check_url($url, $timeout) {
        $args = array(
            'timeout'     => max(1, (int)$timeout),
            'redirection' => 5,
            'sslverify'   => true,
            'headers'     => array( 'Accept' => '*/*' ),
        );

        // HEAD
        $head = wp_remote_head($url, $args);
        if ( ! is_wp_error($head) ) {
            $code = (int) wp_remote_retrieve_response_code($head);
            if ( $code >= 200 && $code < 400 ) {
                return array('ok' => true, 'code' => $code, 'reason' => 'OK');
            }
        }

        // GET fallback; only the status matters, so cap the downloaded body.
        $get_args = $args;
        $get_args['limit_response_size'] = 1024;
        $get = wp_remote_get($url, $get_args);

        if ( is_wp_error($get) ) {
            if ( is_wp_error($head) ) {
                return array('ok' => false, 'code' => 0, 'reason' => $get->get_error_message());
            }
            // HEAD produced a real (failing) status; report that one.
            $resp = $head;
        } else {
            $resp = $get;
        }

        $code = (int) wp_remote_retrieve_response_code($resp);

        if ( $code >= 200 && $code < 400 ) {
            return array('ok' => true, 'code' => $code, 'reason' => 'OK');
        }

        return array('ok' => false, 'code' => $code, 'reason' => wp_remote_retrieve_response_message($resp));
    }
}
