<?php

declare(strict_types=1);

namespace App\Controller;

use Cake\Http\Client;
use Cake\Log\Log;
use Cake\Cache\Cache;
use App\Service\FaqRetrieverService;

class OllamaController extends AppController
{
    /**
     * Pages that chat() downloads, strips to plain text and caches as general context snippets.
     *
     * Order is significant: the ranking helper uses a stable sort, so when several pages tie
     * on relevance the ones listed first here are the ones that survive the top-N cut.
     */
    private const SV_URLS = [
        // Prioritise IBL so fallback selections (e.g., on short confirmations) include it
        'https://www.monash.edu/it/industry-based-learning',
        'https://supervisorconnect.it.monash.edu/projects/research',
        'https://supervisorconnect.it.monash.edu/supervisors',
        'https://supervisorconnect.med.monash.edu/research-projects',
        'https://www.monash.edu/it/dsai/people',
        'https://www.monash.edu/it/hcc/people',
        'https://www.monash.edu/it/ssc/people',
        'https://www.monash.edu/it/digital-health/people',
        'https://www.monash.edu/it/sustainability/people',
        'https://ailecs.org/our-team/',
        'https://www.monash.edu/colam/people',
        'https://www.monash.edu/mats/people',
        'https://www.monash.edu/it/research/research-centres-and-labs',
        'https://www.monash.edu/it/future-students',
        'https://www.monash.edu/it/research',
        'https://www.monash.edu/it/industry-and-community',
        'https://www.monash.edu/it/current-students',
        'https://www.monash.edu/it/about-us',
        'https://www.monash.edu/it/news',
        'https://www.monash.edu/it/events',
        'https://www.monash.edu/it/about-us/diac'
    ];

    /**
     * Find supervisors on the Supervisor Connect index that match the query and build contact chunks.
     *
     * Anchors pointing at /supervisors/... are scored by how many query tokens (3+ characters)
     * occur in the anchor text or the URL slug. The best-scoring profile pages are fetched and
     * scanned for @monash.edu e-mail addresses and phone-like numbers; when no contact detail is
     * found, the first 400 characters of the page text are used instead.
     *
     * Failures are logged and skipped: the method never throws for network or parsing problems.
     *
     * @param string $query       Free-text user query (may contain arbitrary bytes).
     * @param Client $http        HTTP client used for the index and profile requests.
     * @param int    $maxProfiles Maximum number of profile pages to fetch (at least one is tried).
     * @param int    $timeout     Per-request timeout in seconds, applied to index and profile requests.
     * @return array<int, array{url:string, text:string, kind:string}>
     */
    private function fetchSupervisorProfiles(string $query, Client $http, int $maxProfiles = 5, int $timeout = 30): array
    {
        $baseUrl  = 'https://supervisorconnect.it.monash.edu';
        $indexUrl = $baseUrl . '/supervisors';

        try {
            $res = $http->get($indexUrl, [], ['timeout' => $timeout]);
            if (!$res->isOk()) {
                return [];
            }
            $html = $res->getStringBody();
        } catch (\Throwable $e) {
            Log::warning('SV index fetch failed: ' . $e->getMessage());

            return [];
        }

        // very light anchor extraction
        if (!preg_match_all('#<a[^>]+href="(/supervisors/[^"]+)"[^>]*>(.*?)</a>#is', $html, $anchors, PREG_SET_ORDER)) {
            return [];
        }

        // Tokenise the query. mb_scrub() guards against invalid UTF-8, which would otherwise make
        // the /u regex return null and crash mb_strtolower() under strict_types.
        $clean  = preg_replace('/[^\p{L}\p{N}\s-]/u', ' ', mb_scrub($query)) ?? '';
        $tokens = array_values(array_unique(array_filter(
            preg_split('/\s+/u', mb_strtolower($clean), -1, PREG_SPLIT_NO_EMPTY) ?: [],
            static fn (string $t): bool => mb_strlen($t) >= 3
        )));

        // Score each distinct profile link once. Index pages commonly link the same profile from
        // both an image and the name, and duplicates would waste the $maxProfiles budget.
        $candidates = [];
        foreach ($anchors as $anchor) {
            $href  = html_entity_decode($anchor[1], ENT_QUOTES | ENT_HTML5);
            $name  = html_entity_decode(strip_tags($anchor[2]), ENT_QUOTES | ENT_HTML5);
            $name  = trim(preg_replace('#\s+#', ' ', $name) ?? $name);
            $nameL = mb_strtolower($name);
            $slug  = mb_strtolower(basename($href));

            $score = 0.0;
            foreach ($tokens as $token) {
                if (str_contains($nameL, $token) || str_contains($slug, $token)) {
                    $score += 1.0;
                }
            }
            // small baseline to include unspecific (e.g., "AI") queries
            if ($score === 0.0 && $tokens !== []) {
                $score = 0.1;
            }

            if (!isset($candidates[$href])) {
                $candidates[$href] = ['score' => $score, 'href' => $href, 'name' => $name];
                continue;
            }
            $candidates[$href]['score'] = max($candidates[$href]['score'], $score);
            if ($candidates[$href]['name'] === '') {
                // An image-only anchor came first; take the name from the text anchor.
                $candidates[$href]['name'] = $name;
            }
        }

        // Best matches first; the sort is stable, so ties keep the index page order.
        $scored = array_values($candidates);
        usort($scored, static fn (array $a, array $b): int => $b['score'] <=> $a['score']);
        $scored = array_slice($scored, 0, max(1, $maxProfiles));

        $chunks = [];
        foreach ($scored as $s) {
            $url = $baseUrl . $s['href'];

            try {
                $res = $http->get($url, [], ['timeout' => $timeout]);
                if (!$res->isOk()) {
                    continue;
                }
                $page = $res->getStringBody();

                // Visible-ish text, used only as a fallback when no contact detail is found
                $pageText = preg_replace('#\s+#', ' ', trim(strip_tags($page))) ?? '';

                // mailto: links first, then plain-text addresses; de-duplicated case-insensitively
                $emails = [];
                if (preg_match_all('#mailto:([A-Z0-9._%+\-]+@monash\.edu)#i', $page, $em)) {
                    $emails = $em[1];
                }
                if (preg_match_all('#\b[A-Z0-9._%+\-]+@monash\.edu\b#i', $page, $em2)) {
                    $emails = array_merge($emails, $em2[0]);
                }
                $emails = array_values(array_unique(array_map('mb_strtolower', $emails)));

                // Optional: phone (crude; drops very short matches)
                $phones = [];
                if (preg_match_all('#\+?\d[\d\-\s()]{6,}#', $page, $ph)) {
                    $phones = array_values(array_unique(array_filter(
                        array_map('trim', $ph[0]),
                        static fn (string $p): bool => strlen($p) >= 8
                    )));
                }

                // Build a concise text block prioritising contact
                $contactLines = [];
                if ($emails !== []) {
                    $contactLines[] = 'Email: ' . implode(', ', $emails);
                }
                if ($phones !== []) {
                    $contactLines[] = 'Phone: ' . implode(', ', $phones);
                }

                $name   = $s['name'] ?: 'Supervisor';
                $header = $name . ' — Supervisor Connect Profile';
                $body   = $contactLines !== []
                    ? implode(' | ', $contactLines)
                    : mb_substr($pageText, 0, 400); // fallback to a short slice

                $chunks[] = [
                    'url'  => $url,
                    'text' => $header . "\n" . $body,
                    'kind' => 'sv_profile',
                ];
            } catch (\Throwable $e) {
                Log::warning('SV profile fetch failed for ' . $url . ': ' . $e->getMessage());
                continue;
            }
        }

        return $chunks;
    }

    /**
     * Crawl Monash "people" index pages (e.g., DIAC/HCC/DSAI) and fetch top matching profiles.
     *
     * Candidate links come from two passes over each index page: (1) every anchor with visible
     * text that looks like a person link, and (2) headings (h2-h4) whose text also appears inside
     * an anchor. Candidates are scored by query-token overlap with the label and URL slug,
     * de-duplicated by URL, and the best ones are fetched. Each fetched page yields one
     * pre-formatted markdown block (name, area, speciality, e-mail, profile link).
     *
     * Network and parsing failures are logged and skipped; the method does not throw for them.
     *
     * @param string             $query       Free-text user query.
     * @param Client             $http        HTTP client used for all requests.
     * @param array<int, string> $indexUrls   Absolute URLs of the index pages to crawl.
     * @param int                $maxProfiles Maximum number of profile pages to fetch (at least one is tried).
     * @param int                $timeout     Per-request timeout in seconds.
     * @return array<int, array{url:string, text:string, kind:string}>
     */
    private function fetchMonashPeopleProfiles(string $query, Client $http, array $indexUrls, int $maxProfiles = 5, int $timeout = 30): array
    {
        // Tokenise the query. mb_scrub() keeps the /u regex from returning null on invalid UTF-8.
        $clean  = preg_replace('/[^\p{L}\p{N}\s-]/u', ' ', mb_scrub($query)) ?? '';
        $tokens = array_values(array_unique(array_filter(
            preg_split('/\s+/u', mb_strtolower($clean), -1, PREG_SPLIT_NO_EMPTY) ?: [],
            static fn (string $t): bool => mb_strlen($t) >= 2
        )));

        // Resolve an href against its index page. Returns null for anything that is not a
        // fetchable http(s) page (fragments, mailto:, tel:, javascript:, ...).
        $absolutise = static function (string $href, string $indexUrl): ?string {
            $href = trim($href);
            if ($href === '' || $href[0] === '#') {
                return null;
            }
            if (preg_match('#^https?://#i', $href) === 1) {
                return $href;
            }

            $base   = parse_url($indexUrl);
            $scheme = is_array($base) ? ($base['scheme'] ?? 'https') : 'https';
            $host   = is_array($base) ? ($base['host'] ?? 'www.monash.edu') : 'www.monash.edu';

            if (str_starts_with($href, '//')) {
                // protocol-relative: keep the foreign host, borrow the scheme
                return $scheme . ':' . $href;
            }
            if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href) === 1) {
                return null;
            }

            return $scheme . '://' . $host . '/' . ltrim($href, '/');
        };

        // Token overlap on label + URL slug; a tiny baseline keeps generic queries from scoring zero.
        $scoreOf = static function (string $label, string $url) use ($tokens): float {
            $labelL = mb_strtolower($label);
            // parse_url() yields false for malformed URLs; never hand that to basename().
            $slug = mb_strtolower(basename((string)(parse_url($url, PHP_URL_PATH) ?: '')));

            $score = 0.0;
            foreach ($tokens as $token) {
                if (str_contains($labelL, $token) || str_contains($slug, $token)) {
                    $score += 2.0;
                }
            }
            if ($score === 0.0 && $tokens !== []) {
                $score = 0.1;
            }

            return $score;
        };

        $candidateLinks = [];

        foreach ($indexUrls as $indexUrl) {
            $indexUrl = (string)$indexUrl;

            try {
                $res = $http->get($indexUrl, [], ['timeout' => $timeout]);
                if (!$res->isOk()) {
                    continue;
                }
                $html = $res->getStringBody();
            } catch (\Throwable $e) {
                Log::warning('People index fetch failed for ' . $indexUrl . ': ' . $e->getMessage());
                continue;
            }

            // 1) Anchor tags with an href and visible text
            preg_match_all('#<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>#is', $html, $aMatches, PREG_SET_ORDER);
            foreach ($aMatches ?: [] as $a) {
                $rawHref = html_entity_decode($a[1], ENT_QUOTES | ENT_HTML5);
                $label   = trim(strip_tags($a[2]));
                if ($label === '') {
                    continue;
                }

                $host = (string)(parse_url($rawHref, PHP_URL_HOST) ?: '');
                $path = (string)(parse_url($rawHref, PHP_URL_PATH) ?: '');

                // Accept: Monash hosts with "people-ish" paths, any root-relative link, or a
                // label that looks like a human name ("First Last" / "First Middle Last").
                $accept = (preg_match('#(^|\.)(monash\.edu)$#i', $host) === 1
                        && preg_match('#/(people|profiles|en/persons|staff|our-people|directory)/#i', $path) === 1)
                    || str_starts_with($rawHref, '/')
                    || preg_match('/^[A-Za-z][A-Za-z\-’\']+(?:\s+[A-Za-z][A-Za-z\-’\']+){1,2}$/', $label) === 1;
                if (!$accept) {
                    continue;
                }

                $href = $absolutise($rawHref, $indexUrl);
                if ($href === null) {
                    continue;
                }

                $candidateLinks[] = ['score' => $scoreOf($label, $href), 'url' => $href, 'name' => $label];
            }

            // 2) Names wrapped in headings within cards (h2/h3/h4): use the first anchor on the
            //    page whose text contains the heading text.
            preg_match_all('#<(h2|h3|h4)[^>]*>(.*?)</\1>#is', $html, $hMatches, PREG_SET_ORDER);
            foreach ($hMatches ?: [] as $h) {
                $label = trim(strip_tags($h[2]));
                if (mb_strlen($label) < 3) {
                    continue;
                }

                $nearPattern = '#<a[^>]+href="([^"]+)"[^>]*>[^<]*' . preg_quote($label, '#') . '[^<]*</a>#is';
                if (preg_match($nearPattern, $html, $near) !== 1) {
                    continue;
                }

                $href = $absolutise(html_entity_decode($near[1], ENT_QUOTES | ENT_HTML5), $indexUrl);
                if ($href === null) {
                    continue;
                }

                $candidateLinks[] = ['score' => $scoreOf($label, $href), 'url' => $href, 'name' => $label];
            }
        }

        if ($candidateLinks === []) {
            return [];
        }

        // Dedup by URL, keep best score
        $byUrl = [];
        foreach ($candidateLinks as $c) {
            $key = $c['url'];
            if (!isset($byUrl[$key]) || $c['score'] > $byUrl[$key]['score']) {
                $byUrl[$key] = $c;
            }
        }
        $candidates = array_values($byUrl);

        usort($candidates, static fn (array $a, array $b): int => $b['score'] <=> $a['score']);
        $candidates = array_slice($candidates, 0, max(1, $maxProfiles));

        // Checked in order; the first keyword found as a whole word/phrase becomes the "area".
        $areaKeywords = [
            'machine learning', 'computer vision', 'natural language processing', 'data science',
            'cybersecurity', 'software engineering', 'human-centred computing', 'human centered computing',
            'optimization', 'bayesian', 'reinforcement learning', 'ai',
        ];

        // Fetch each profile page; pull email & a short spec line
        $chunks = [];
        foreach ($candidates as $c) {
            $url  = $c['url'];
            $name = $c['name'] ?: 'Staff';

            try {
                $res = $http->get($url, [], ['timeout' => $timeout]);
                if (!$res->isOk()) {
                    continue;
                }
                $page = $res->getStringBody();

                $pageText = preg_replace('#\s+#', ' ', trim(strip_tags($page))) ?? '';

                // emails: mailto links first, then plain-text addresses
                $emails = [];
                if (preg_match_all('#mailto:([A-Z0-9._%+\-]+@monash\.edu)#i', $page, $em)) {
                    $emails = array_values(array_unique($em[1]));
                }
                if (preg_match_all('#\b[A-Z0-9._%+\-]+@monash\.edu\b#i', $page, $em2)) {
                    $emails = array_values(array_unique(array_merge($emails, $em2[0])));
                }

                // crude "area/spec" extraction: first sentence after a known heading, max 160 chars
                $spec = 'N/A';
                if (preg_match('#(Research interests|Areas of expertise|Expertise|Research area)s?\s*[:\-]?\s*(.{40,300})#i', $pageText, $ri)) {
                    $grab = trim($ri[2]);
                    $spec = preg_split('/(?<=[.?!])\s+/', $grab)[0] ?? $grab;
                    $spec = mb_substr($spec, 0, 160);
                }

                // area keywords (quick guess). Match on word boundaries: a plain substring test
                // for "ai" also hits "email", "available", ... and mislabels nearly every page.
                $area = 'N/A';
                foreach ($areaKeywords as $kw) {
                    $kwPattern = '/(?<![\p{L}\p{N}])' . preg_quote($kw, '/') . '(?![\p{L}\p{N}])/iu';
                    if (preg_match($kwPattern, $pageText) === 1) {
                        $area = $kw;
                        break;
                    }
                }

                // build markdown
                $lines = [];
                $lines[] = "- **{$name}**";
                $lines[] = "  - Research area: **" . $area . "**";
                $lines[] = "  - Specialises in " . $spec;
                if ($emails !== []) {
                    $mailLinks = array_map(static fn (string $addr): string => "[{$addr}](mailto:{$addr})", $emails);
                    $lines[] = "  - Email: " . implode(', ', $mailLinks);
                } else {
                    $lines[] = "  - Email: N/A";
                }
                $lines[] = "  - Profile link: [{$url}]({$url})";

                $chunks[] = [
                    'url'  => $url,
                    'text' => implode("\n", $lines),
                    'kind' => 'monash_profile',
                ];
            } catch (\Throwable $e) {
                Log::warning('Monash profile fetch failed for ' . $url . ': ' . $e->getMessage());
                continue;
            }
        }

        return $chunks;
    }

    /**
     * Keyword FAQ lookup: FULLTEXT search first, substring LIKE search as a fallback.
     *
     * The FULLTEXT query needs a MySQL FULLTEXT index on (question, answer). When it fails
     * (no index, other driver) or finds nothing, a LIKE search over question/answer/tags is used.
     *
     * @param string $query User query text.
     * @param int    $k     Maximum number of FAQ rows to return.
     * @return array<int, array{url:string, text:string, kind:string}>
     */
    private function findFaqSnippets(string $query, int $k = 3): array
    {
        $faqs = $this->fetchTable('Faqs');

        $rows = [];
        try {
            // The value is quoted by the connection before being embedded in the raw expression.
            $quoted = $faqs->getConnection()->quote($query);
            $match  = "MATCH(question, answer) AGAINST ($quoted IN NATURAL LANGUAGE MODE)";
            $rows = $faqs->find()
                ->select(['id', 'question', 'answer', 'link'])
                ->where(function ($exp) use ($match) {
                    return $exp->add($match);
                })
                ->orderDesc($match)
                ->limit($k)
                ->enableHydration(false)
                ->all()
                ->toList();
        } catch (\Throwable $e) {
            // Best effort only: record why FULLTEXT was unavailable, then fall through to LIKE.
            Log::debug('FAQ FULLTEXT lookup failed, falling back to LIKE: ' . $e->getMessage());
        }

        if (count($rows) === 0) {
            // Escape LIKE metacharacters so "%" or "_" typed by the user are matched literally.
            // NOTE(review): assumes backslash is the LIKE escape character (MySQL default) — confirm
            // if this ever runs against another driver.
            $needle = '%' . addcslashes($query, '\\%_') . '%';

            $rows = $faqs->find()
                ->select(['id', 'question', 'answer', 'link'])
                ->where(['OR' => [
                    'question LIKE' => $needle,
                    'answer LIKE'   => $needle,
                    'tags LIKE'     => $needle,
                ]])
                ->limit($k)
                ->enableHydration(false)
                ->all()
                ->toList();
        }

        // Format as context chunks the model can cite; rows without a link are labelled "FAQ".
        $chunks = [];
        foreach ($rows as $r) {
            $chunks[] = [
                'url'  => $r['link'] ?: 'FAQ',
                'text' => "Q: {$r['question']}\nA: {$r['answer']}",
                'kind' => 'faq',
            ];
        }

        return $chunks;
    }


    /**
     * Rank generic context chunks by simple token overlap with the user query.
     *
     * Each distinct query token (2+ characters) scores 1.0 when it occurs in the chunk URL or in
     * the first 800 characters of its text, plus 1.5 when it equals a whole URL path segment
     * (e.g. "/diac"). Chunks are returned best first; ties keep their input order.
     *
     * @param array<int, array{url?:string, text?:string, kind?:string}> $chunks Chunks to rank.
     * @param string $query User query; an empty or token-less query leaves the order unchanged.
     * @param int    $topN  Maximum number of chunks to return (values below 0 are treated as 0).
     * @return array<int, array{url?:string, text?:string, kind?:string}>
     */
    private function rankChunksByQuery(array $chunks, string $query, int $topN = 3): array
    {
        $limit = max(0, $topN);
        if ($chunks === [] || trim($query) === '') {
            return array_slice($chunks, 0, $limit);
        }

        // mb_scrub() first: with invalid UTF-8 the /u regex returns null, and passing null to
        // mb_strtolower() is a TypeError under strict_types. Split on any whitespace so that
        // tab/newline separated words become separate tokens.
        $clean  = preg_replace('/[^\p{L}\p{N}\s-]/u', ' ', mb_scrub($query)) ?? '';
        $tokens = array_values(array_unique(array_filter(
            preg_split('/\s+/u', mb_strtolower($clean), -1, PREG_SPLIT_NO_EMPTY) ?: [],
            static fn (string $t): bool => mb_strlen($t) >= 2
        )));
        if ($tokens === []) {
            return array_slice($chunks, 0, $limit);
        }

        $scored = [];
        foreach ($chunks as $chunk) {
            $url = (string)($chunk['url'] ?? '');
            $hay = mb_strtolower($url . ' ' . mb_substr((string)($chunk['text'] ?? ''), 0, 800));
            // parse_url() returns false for malformed URLs and null when there is no path.
            $path = mb_strtolower((string)(parse_url($url, PHP_URL_PATH) ?: ''));

            $score = 0.0;
            foreach ($tokens as $token) {
                if (str_contains($hay, $token)) {
                    $score += 1.0;
                }
                // Boost when a URL path segment is exactly the token (e.g., "/diac")
                if ($path !== '' && preg_match('#(^|/)' . preg_quote($token, '#') . '(/|$)#', $path) === 1) {
                    $score += 1.5;
                }
            }
            $scored[] = ['score' => $score, 'chunk' => $chunk];
        }

        usort($scored, static fn (array $a, array $b): int => $b['score'] <=> $a['score']);

        return array_slice(array_column($scored, 'chunk'), 0, $limit);
    }

    /**
     * Extract distinct http/https URLs from the user's prompt text, in order of appearance.
     *
     * Trailing sentence punctuation and closing brackets are stripped, surrounding angle
     * brackets or quotes are not treated as part of the URL, and matches without a host are
     * ignored.
     *
     * @param string $text Free text that may contain URLs.
     * @param int    $max  Maximum number of URLs to return; zero or less returns none.
     * @return array<int, string>
     */
    private function extractUrlsFromText(string $text, int $max = 3): array
    {
        if ($max <= 0 || !preg_match_all('#https?://[^\s<>"\')]+#i', $text, $found)) {
            return [];
        }

        $urls = [];
        foreach ($found[0] as $candidate) {
            $candidate = rtrim($candidate, '.,;:!?)]}');
            // "https://" followed only by punctuation is not a usable URL
            if (empty(parse_url($candidate, PHP_URL_HOST))) {
                continue;
            }
            $urls[$candidate] = true; // keyed by URL to de-duplicate while keeping order
            if (count($urls) >= $max) {
                break;
            }
        }

        return array_keys($urls);
    }

    /**
     * Canonicalize specific Monash URLs to avoid variant links (e.g., IBL page).
     *
     * Any http/https, www/non-www spelling of the two known IBL page paths, with or without
     * trailing slashes, maps to the single canonical IBL URL. Every other URL is returned
     * trimmed but otherwise unchanged (URLs with extra path segments, query strings or
     * fragments are deliberately left alone).
     *
     * @param string $url URL to normalise.
     * @return string Canonical URL for known variants, otherwise the trimmed input.
     */
    private function canonicalizeMonashUrl(string $url): string
    {
        $candidate = trim($url);

        $iblVariant = '#^https?://(?:www\.)?monash\.edu/it/'
            . '(?:current-students/industry/ibl|industry-based-learning)/*$#i';

        return preg_match($iblVariant, $candidate) === 1
            ? 'https://www.monash.edu/it/industry-based-learning'
            : $candidate;
    }

    /**
     * Replace variant URLs in the model's answer text with canonical ones.
     *
     * Only complete IBL URLs are rewritten. A URL that continues with further path segments,
     * a query string or a fragment is left untouched: rewriting just its prefix would splice
     * the canonical URL onto the remainder and produce a broken link.
     *
     * @param string $text Answer text produced by the model.
     * @return string Text with known IBL URL variants replaced by the canonical URL.
     */
    private function canonicalizeAnswerText(string $text): string
    {
        if ($text === '') {
            return $text;
        }

        // The negative lookahead is the end-of-URL check: the next character must not continue
        // the URL. A "." or "?" is allowed when it is sentence punctuation (not followed by a
        // word character).
        $iblVariant = '#https?://(?:www\.)?monash\.edu/it/'
            . '(?:current-students/industry/ibl|industry-based-learning)/?'
            . '(?![\w\-/%\#=&~]|[.?]\w)#i';

        return preg_replace($iblVariant, 'https://www.monash.edu/it/industry-based-learning', $text) ?? $text;
    }

    /**
     * Decide whether a message is nothing more than a short acknowledgement ("yes", "ok, please").
     *
     * Callers use this to reuse the previous real question for retrieval instead of the
     * acknowledgement itself. Matching is case-insensitive and ignores surrounding whitespace.
     */
    private function isConfirmation(string $text): bool
    {
        $normalised = mb_strtolower(trim($text));
        if ($normalised === '') {
            return false;
        }

        // Exact phrases (no trailing punctuation allowed for these)
        $exactPhrases = [
            'y', 'yes', 'yeah', 'yep', 'ok', 'okay', 'sure', 'please', 'please do', 'go ahead', 'affirmative', 'do it', 'sounds good', 'alright'
        ];
        if (in_array($normalised, $exactPhrases, true)) {
            return true;
        }

        // Looser shapes: "<yes-word>[, please [do]]" and "please" / "go ahead", each with an
        // optional closing punctuation mark.
        $shapes = [
            '/^(yes|yeah|yep|ok|okay|sure)(,?\s+please(\s+do)?)?[.!?]?$/i',
            '/^(please|go ahead)[.!?]?$/i',
        ];
        foreach ($shapes as $shape) {
            if (preg_match($shape, $normalised) === 1) {
                return true;
            }
        }

        return false;
    }

    /**
     * Pick the text to use for retrieval.
     *
     * A normal message is used as-is. When the current input is only a confirmation ("yes"),
     * the history is scanned from newest to oldest for the latest non-empty user message that
     * is not itself a confirmation; if none exists the current input is returned.
     *
     * @param array<int, array{role:string, content:string}> $conv
     */
    private function deriveRetrievalQuery(string $currentInput, array $conv): string
    {
        if ($this->isConfirmation($currentInput) === false) {
            return $currentInput;
        }

        $idx = count($conv);
        while ($idx-- > 0) {
            $entry  = $conv[$idx];
            $isUser = ($entry['role'] ?? '') === 'user';
            $body   = $isUser ? trim((string)($entry['content'] ?? '')) : '';

            if ($body !== '' && !$this->isConfirmation($body)) {
                return $body;
            }
        }

        // Nothing better in the history: fall back to what the user just typed
        return $currentInput;
    }

    /**
     * Fetch a single user-supplied URL and return it as a context chunk.
     *
     * The URL comes straight from an unauthenticated request, so it is only fetched when it is
     * an http(s) URL whose host resolves exclusively to public IP addresses; redirects are not
     * followed. This keeps the endpoint from being used to read internal services
     * (loopback, private ranges, cloud metadata addresses).
     *
     * @param Client $http    HTTP client used for the request.
     * @param string $url     URL taken from the user's prompt.
     * @param int    $timeout Request timeout in seconds.
     * @return array{url:string, text:string, kind:string}|null Null when the URL is rejected,
     *         the request fails, or the page has no text.
     */
    private function fetchUrlChunk(Client $http, string $url, int $timeout = 30): ?array
    {
        if (!$this->isPublicHttpUrl($url)) {
            Log::warning('User URL rejected (not a public http/https target): ' . $url);

            return null;
        }

        try {
            // A redirect could point at an internal address that was never validated.
            $res = $http->get($url, [], ['timeout' => $timeout, 'redirect' => false]);
            if (!$res->isOk()) {
                return null;
            }

            $text = preg_replace('#\s+#', ' ', trim(strip_tags($res->getStringBody()))) ?? '';
            if ($text === '') {
                return null;
            }

            return [
                'url'  => $url,
                'text' => mb_substr($text, 0, 2000),
                'kind' => 'user_url',
            ];
        } catch (\Throwable $e) {
            Log::warning('User URL fetch failed for ' . $url . ': ' . $e->getMessage());

            return null;
        }
    }

    /**
     * True when $url is http/https, carries no credentials, and its host resolves only to
     * public (non-private, non-reserved) IP addresses.
     *
     * Hosts that cannot be resolved to an IPv4 address are rejected. The check happens before
     * the request, so it narrows but does not fully eliminate DNS-rebinding tricks.
     */
    private function isPublicHttpUrl(string $url): bool
    {
        $parts = parse_url($url);
        if (!is_array($parts) || isset($parts['user']) || isset($parts['pass'])) {
            return false;
        }

        $scheme = strtolower((string)($parts['scheme'] ?? ''));
        $host   = trim((string)($parts['host'] ?? ''), '[]'); // IPv6 literals arrive bracketed
        if ($host === '' || !in_array($scheme, ['http', 'https'], true)) {
            return false;
        }

        $addresses = filter_var($host, FILTER_VALIDATE_IP) !== false
            ? [$host]
            : (gethostbynamel($host) ?: []);
        if ($addresses === []) {
            return false;
        }

        foreach ($addresses as $address) {
            $isPublic = filter_var(
                $address,
                FILTER_VALIDATE_IP,
                FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE
            );
            if ($isPublic === false) {
                return false;
            }
        }

        return true;
    }



    /**
     * Controller bootstrap: opens the chat and index actions to anonymous visitors and loads
     * the Users table onto the controller.
     */
    public function initialize(): void
    {
        parent::initialize();

        $publicActions = ['chat', 'index'];
        $this->Authentication->allowUnauthenticated($publicActions);

        $this->Users = $this->fetchTable('Users');
    }

    /**
     * POST endpoint: answer a user prompt with the OpenAI Responses API, grounded in Monash context.
     *
     * Request data: `prompt` (required text) and optional `history`, a list of
     * {role, content|text} entries of which the last six are used. Context is assembled from
     * URLs pasted in the prompt, FAQ rows, Supervisor Connect profiles, cached Monash pages and
     * Monash people profiles, capped at eight chunks.
     *
     * Responses are JSON: `{text, sources}` on success; `{error}` with status 400 (missing
     * prompt), 500 (API key not configured) or 502 (upstream failure, plus `status` when the
     * API answered with a non-OK code). Upstream details are logged, never returned to the client.
     *
     * @return \Psr\Http\Message\ResponseInterface
     */
    public function chat()
    {
        $this->request->allowMethod(['post']);

        // Single place to build JSON replies. Invalid UTF-8 (e.g. from scraped pages) is
        // substituted instead of making json_encode() return false.
        $respond = function (array $payload, int $status = 200) {
            $body = json_encode($payload, JSON_INVALID_UTF8_SUBSTITUTE);

            return $this->response->withStatus($status)
                ->withType('application/json')
                ->withStringBody($body === false ? '{"error":"Failed to encode response"}' : $body);
        };

        // 0) Input
        $rawPrompt = $this->request->getData('prompt');
        $userInput = is_scalar($rawPrompt) ? (string)$rawPrompt : '';
        if (trim($userInput) === '') {
            return $respond(['error' => 'prompt is required'], 400);
        }

        // 1) Config
        // SECURITY: the key must come from the environment only. A key was previously committed
        // here as a fallback default; treat it as compromised and rotate it.
        $apiKey = (string)env('OPENAI_API_KEY', '');
        if ($apiKey === '') {
            return $respond(['error' => 'OPENAI_API_KEY is not set in environment'], 500);
        }

        // env() defaults are passed as strings and cast afterwards, so the calls stay valid if
        // env() declares a string-typed default parameter.
        $timeout = (int)env('HTTP_TIMEOUT', '60');
        $model   = (string)env('OPENAI_MODEL', 'gpt-5-chat-latest');

        $http = new Client(['timeout' => $timeout, 'headers' => ['User-Agent' => 'SVConnectBot/1.0']]);

        // 2) Supervisor Connect/Monash snippets (cached). Change key to bust old cache when code changes.
        $snippets = Cache::remember('svconnect_snippets_v3', function () use ($http) {
            $items = [];
            foreach (self::SV_URLS as $url) {
                try {
                    $res = $http->get($url);
                    if (!$res->isOk()) {
                        continue;
                    }
                    $text = preg_replace('#\s+#', ' ', trim(strip_tags($res->getStringBody())));
                    if ($text) {
                        // keep snippets modest so we can carry more diverse sources
                        $items[] = ['url' => $url, 'text' => mb_substr($text, 0, 1500)];
                    }
                } catch (\Throwable $e) {
                    Log::warning("SV fetch failed for {$url}: " . $e->getMessage());
                }
            }

            return $items;
        }, 'default');

        // Monash people index pages to crawl for profiles (DIAC/DSAI/HCC/etc.)
        $peopleIndex = [
            'https://www.monash.edu/it/about-us/diac',
            'https://www.monash.edu/it/dsai/people',
            'https://www.monash.edu/it/hcc/people',
            'https://www.monash.edu/it/ssc/people',
            'https://www.monash.edu/it/digital-health/people',
            'https://www.monash.edu/it/sustainability/people',
        ];

        // 3) Conversation history: last six entries, each capped at 2000 characters. Malformed
        //    entries (non-array, non-scalar content) are skipped rather than cast.
        $conv = [];
        $rawHistory = $this->request->getData('history');
        if (is_array($rawHistory)) {
            foreach (array_slice($rawHistory, -6) as $entry) {
                if (!is_array($entry)) {
                    continue;
                }
                $content = $entry['content'] ?? $entry['text'] ?? '';
                if (!is_scalar($content) || (string)$content === '') {
                    continue;
                }
                $conv[] = [
                    'role'    => ($entry['role'] ?? '') === 'assistant' ? 'assistant' : 'user',
                    'content' => mb_substr((string)$content, 0, 2000),
                ];
            }
        }

        // On a bare confirmation ("yes") retrieval uses the previous real question instead.
        $retrievalQuery = $this->deriveRetrievalQuery($userInput, $conv);

        // Expand common acronyms/aliases once; used for FAQ retrieval and page ranking.
        $isIblIntent = preg_match('/\bibl\b/i', $retrievalQuery) === 1
            || preg_match('/industry\s*-?based\s*learning/i', $retrievalQuery) === 1;
        $augQuery = $isIblIntent
            ? $retrievalQuery . ' industry based learning industry-based learning monash IBL program'
            : $retrievalQuery;

        // 4) Retrieval
        $monashPeopleChunks = $this->fetchMonashPeopleProfiles(
            $retrievalQuery,
            $http,
            $peopleIndex,
            (int)env('MONASH_PROFILE_LIMIT', '4')
        );

        // URLs pasted in the prompt are fetched and placed first in the context
        $userUrlChunks = [];
        foreach ($this->extractUrlsFromText($userInput, 3) as $pastedUrl) {
            $fetched = $this->fetchUrlChunk($http, $pastedUrl, $timeout);
            if ($fetched) {
                $userUrlChunks[] = $fetched;
            }
        }

        // FAQ retrieval: semantic search when the controller provides it, else keyword search
        $faqChunks = [];
        if (method_exists($this, 'findFaqSnippetsSemantic')) {
            try {
                $faqChunks = $this->findFaqSnippetsSemantic($augQuery, 3);
            } catch (\Throwable $e) {
                Log::warning('Semantic FAQ retrieval failed: ' . $e->getMessage());
            }
        }
        if (empty($faqChunks) && method_exists($this, 'findFaqSnippets')) {
            try {
                $faqChunks = $this->findFaqSnippets($augQuery, 3);
            } catch (\Throwable $e) {
                Log::warning('Keyword FAQ retrieval failed: ' . $e->getMessage());
                $faqChunks = [];
            }
        }

        $svProfileChunks = $this->fetchSupervisorProfiles($retrievalQuery, $http, (int)env('SV_PROFILE_LIMIT', '3'));
        $svChunks = array_map(
            static fn (array $s): array => ['url' => $s['url'], 'text' => $s['text'], 'kind' => 'sv'],
            $snippets
        );

        // Prioritise SV/Monash pages that match the query (e.g., DIAC) so they aren't dropped by the cap
        $svRelevant = $this->rankChunksByQuery($svChunks, $augQuery, (int)env('SV_PAGES_TOPN', '3'));

        // If IBL intent detected, force-include the official IBL page at the front
        if ($isIblIntent) {
            $iblUrl = 'https://www.monash.edu/it/industry-based-learning';
            $byUrl  = static fn (array $c): bool => ($c['url'] ?? '') === $iblUrl;

            $iblChunks     = array_values(array_filter($svChunks, $byUrl));
            $alreadyRanked = array_filter($svRelevant, $byUrl) !== [];
            if ($iblChunks !== [] && !$alreadyRanked) {
                array_unshift($svRelevant, $iblChunks[0]);
            }
        }

        // 5) Merge context in a relevance-friendly order, cap, and canonicalize known URL variants
        $finalChunks = array_slice(
            array_merge($userUrlChunks, $faqChunks, $svProfileChunks, $svRelevant, $monashPeopleChunks),
            0,
            8
        );
        $finalChunks = array_map(function (array $chunk): array {
            if (isset($chunk['url'])) {
                $chunk['url'] = $this->canonicalizeMonashUrl((string)$chunk['url']);
            }

            return $chunk;
        }, $finalChunks);

        $contextParts = array_map(static fn (array $c): string => "SOURCE: {$c['url']}\n{$c['text']}", $finalChunks);
        $contextBlock = implode("\n\n---\n\n", $contextParts);

        // URLs are already canonical at this point; just de-duplicate
        $sourceUrls = array_values(array_unique(array_map(
            static fn (array $c): string => (string)($c['url'] ?? ''),
            $finalChunks
        )));

        // 6) System prompt (only apologize if no context)
        $hasContext = count($finalChunks) > 0;
        $maxLinks   = max(1, (int)env('SVCONNECT_MAX_LINKS', '4'));

        if ($hasContext) {
            $system = <<<SYSTEM
You are the Monash Assistant.

SCOPE AND POLICY:
- Only answer questions about Monash University or study-related topics at Monash (e.g., courses, programs, admissions, scholarships, fees, timetables, supervisors, research projects, campus services, student life, contact points, official processes, staff, advisors).
- If the user asks for anything outside this scope (for example: coding help, general programming, unrelated general knowledge), politely refuse with: "I can only help with Monash University and study-related questions."
- Do not provide off-topic answers or code; do not refer the user elsewhere except via official Monash links.
- Confirmation handling: If the user responds with a short confirmation (e.g., "yes", "yeah", "yep", "ok", "sure", "please do", "go ahead"), treat it as approval for your last offered action (like sharing links) and proceed. Do NOT refuse or ask for clarification.

STRICT LINKING RULES (READ CAREFULLY):
- You may ONLY include hyperlinks that appear exactly in the SOURCES list below.
- Do NOT invent, guess, or modify URLs (no adding /faqs, /apply, query params, anchors, or subpaths).
- If you want to reference a page that is NOT in SOURCES, write plain text (no link).
- Prefer linking to at most {$maxLinks} items.
- If you cannot find any relevant links in SOURCES, respond with a concise answer WITHOUT links.

ANSWERING RULES:
- Use ONLY the provided context (FAQ entries, Supervisor Connect, and Monash IT pages included in the context) to answer. Do NOT invent facts or links.
- Be concise; prefer short bullet points. For lookups, list names/titles with links.
- When listing supervisors, use this exact format per person:
    - **Full Name**
        - Research area: **<one or two words>**
        - Specialises in <short phrase>
        - Email: <email or "N/A">
        - Supervisor profile link: <URL>
- If something is not covered by the context, say you don't have that info and suggest up to {$maxLinks} relevant official Monash links.
SYSTEM;
        } else {
            $system = <<<SYSTEM
You are the Monash Assistant. There is no usable context for this query.
Reply: "I can only help you with research and supervisors at the moment. We will update this in the future. Apologies for any inconvenience."
Then suggest up to {$maxLinks} relevant official links from the provided sources, if any.
SYSTEM;
        }

        // 7) Messages: instructions, context, prior turns, then the current prompt
        $messages = [
            ['role' => 'system', 'content' => $system],
            ['role' => 'system', 'content' => "SUPERVISOR CONNECT CONTEXT START\n{$contextBlock}\nSUPERVISOR CONNECT CONTEXT END"],
        ];
        $messages   = array_merge($messages, $conv);
        $messages[] = ['role' => 'user', 'content' => $userInput];

        // 8) OpenAI Responses API call
        $headers = [
            'Authorization' => 'Bearer ' . $apiKey,
            'Content-Type'  => 'application/json',
        ];
        if ($org = env('OPENAI_ORGANIZATION')) {
            $headers['OpenAI-Organization'] = (string)$org;
        }
        if ($proj = env('OPENAI_PROJECT')) {
            $headers['OpenAI-Project'] = (string)$proj;
        }

        try {
            $apiRes = $http->post('https://api.openai.com/v1/responses', [
                'model'       => $model,
                'input'       => $messages,
                'temperature' => (float)env('OPENAI_TEMPERATURE', '0.2'),
            ], ['type' => 'json', 'headers' => $headers]);
        } catch (\Throwable $e) {
            // Details go to the log only; this endpoint is reachable without authentication.
            Log::error('OpenAI HTTP exception: ' . $e->getMessage());

            return $respond(['error' => 'OpenAI call threw'], 502);
        }

        if (!$apiRes->isOk()) {
            Log::error('OpenAI non-OK ' . $apiRes->getStatusCode() . ' ' . $apiRes->getStringBody());

            return $respond(['error' => 'OpenAI call failed', 'status' => $apiRes->getStatusCode()], 502);
        }

        // 9) Parse Responses API result: concatenate every output_text part of message outputs
        $data = $apiRes->getJson();
        $text = '';
        if (is_array($data) && isset($data['output']) && is_array($data['output'])) {
            foreach ($data['output'] as $part) {
                if (($part['type'] ?? '') !== 'message') {
                    continue;
                }
                foreach (($part['content'] ?? []) as $item) {
                    if (($item['type'] ?? '') === 'output_text') {
                        $text .= (string)($item['text'] ?? '');
                    }
                }
            }
        } else {
            $text = is_array($data) ? (string)($data['output_text'] ?? '') : '';
        }

        // 10) Canonicalize URLs in answer text and return merged sources
        return $respond([
            'text'    => $this->canonicalizeAnswerText($text),
            'sources' => $sourceUrls,
        ]);
    }


    /**
     * Index action. Intentionally empty: there is no controller-side logic here.
     * It is one of the actions opened to unauthenticated users in initialize().
     * NOTE(review): presumably the framework renders the matching view template — confirm.
     */
    public function index()
    {
    }
}
