All files / src/aggregator/metadata resolve-utils.ts

75.9% Statements 63/83
67.24% Branches 39/58
86.66% Functions 13/15
81.25% Lines 52/64

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215                        24x   24x   24x                 24x   24x   24x   24x   24x   24x   24x   24x   24x   24x     24x     235710x 235710x 235710x 235314x 235314x 1362518x                   20076x 20076x                   20076x                 20076x                                         24187x 20085x 20076x   20076x                                                         175555x       106806x 102217x   126505x   126505x 102217x       106806x 106806x             48673x 48673x 43663x 42266x 42266x 1770x   40496x       24187x 24187x 24182x                                             68532x 68532x 68532x                     75132x 176048x   1379x    
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
// SPDX-License-Identifier: Apache-2.0
 
/**
 * @module Aggregator/Metadata/ResolveUtils
 * @description Low-level utility functions extracted from resolve-helpers
 * to keep each leaf module under the 600-line drift guard.
 */
 
import { extractFirstSentence, shouldSkipDescriptionLine, truncateTitle } from './text-utils.js';
import { findTitleRejectionReason } from './title-rejection.js';
 
/**
 * Detects an internal run-id slug: a letter-led, hyphenated name followed by
 * `-run`, an optional hyphen, a run number, another hyphen, and a trailing
 * number of at least eight digits (e.g. `breaking-run180-1779846371`).
 * Case-insensitive. Not global, so `.test()` carries no `lastIndex` state.
 */
const LEAKY_RUNID_RE = /\b[a-z][a-z-]*-run-?\d+-\d{8,}\b/iu;
/**
 * Matches workflow run-number patterns like "Run 271" or "— Run 42" in titles.
 * "Run" must sit at the start of the string or directly after whitespace or
 * one of the listed punctuation characters; the match is case-sensitive.
 */
const RUN_NUMBER_RE = /(?:^|[\s—–\-(,;:|/])Run\s+\d+/u;
/**
 * Pipeline-jargon and internal-token patterns that must never leak into SEO copy.
 * None of the patterns is global, so they are safe to reuse with `.test()`.
 */
const SEO_PIPELINE_PATTERNS: readonly RegExp[] = Object.freeze([
  // "Stage A" … "Stage E" markers (whitespace between word and letter optional).
  /\bStage\s*[A-E]\b/iu,
  // "prefetch", "pre-fetch", "prefetched", "pre-fetching", …
  /\bpre-?fetch(?:ed|ing)?\b/iu,
  // The literal phrase "feed(s) were pre-fetched".
  /\bfeeds?\s+were\s+pre-?fetched\b/iu,
  // Any path that starts with "scripts/".
  /\bscripts\//iu,
  // "#" immediately followed by digits, e.g. "#265".
  /#\d+\b/u,
  // "HH:MM:SSZ"-shaped timestamps, e.g. "05:40:10Z".
  /\b\d{2}:\d{2}:\d{2}Z\b/u,
]);
/** Word-level strip pattern for "Run N" tokens (with optional hyphenated suffix). Case-insensitive. */
const RUN_TOKEN_STRIP_RE = /\bRun\s+\d[\d-]*/giu;
/** Internal run-id slug strip (e.g. `breaking-run180-1779846371`), including trailing separators. */
const RUNID_TOKEN_STRIP_RE = /\b[a-z][a-z-]*-run-?\d+-\d{8,}[\s,;:|/]*/giu;
/** "analysis run" phrase strip, with an optional run number and trailing separators. */
const ANALYSIS_RUN_STRIP_RE = /\banalysis\s+run\s*\d*[\s,;:|/]*/giu;
/** Word-level strip for Stage A-E pipeline markers, including trailing separators. */
const STAGE_TOKEN_STRIP_RE = /\bStage\s*[A-E]\b[\s,;:|/\\-]*/giu;
/** Word-level strip for "prefetch"/"pre-fetched" pipeline jargon, including trailing separators. */
const PREFETCH_TOKEN_STRIP_RE = /\bpre-?fetch(?:ed|ing)?\b[\s,;:|/\\-]*/giu;
/** Strip explicit "feeds were pre-fetched" sentence fragments, including trailing separators. */
const PREFETCH_FEEDS_STRIP_RE = /\bfeeds?\s+were\s+pre-?fetched\b[\s,;:|/\\-]*/giu;
/** Strip leaked internal script paths: "scripts/" up to the next whitespace or closing delimiter. */
const SCRIPTS_PATH_STRIP_RE = /\bscripts\/[^\s,;:|)\]]*/giu;
/** Strip leaked issue/run-number tokens like "#265". */
const HASH_NUMBER_STRIP_RE = /#\d+\b/gu;
/** Strip leaked internal timestamps like "05:40:10Z". */
const INTERNAL_TIME_STRIP_RE = /\b\d{2}:\d{2}:\d{2}Z\b/gu;
/**
 * All-caps document-reference prefix (e.g. "KJ-01: ", "SITUATION: ").
 * Anchored to the start of the string; requires a colon followed by whitespace.
 */
const DOC_REF_PREFIX_RE = /^[A-Z][A-Z0-9 -]{1,40}:\s+/u;

/**
 * Minimum title length below which a title is unusable.
 * Compared against `String.prototype.length`, i.e. UTF-16 code units.
 */
const SEO_TITLE_FLOOR = 20;
 
/**
 * Report whether a text fragment still carries internal workflow or
 * pipeline tokens that must not reach SEO copy.
 *
 * The checks run in this order and stop at the first hit:
 * 1. the phrase `analysis run` anywhere in the text (case-insensitive
 *    substring match, no word-boundary requirement);
 * 2. an internal run-id slug (`LEAKY_RUNID_RE`);
 * 3. a "Run N" run-number token (`RUN_NUMBER_RE`);
 * 4. any of the pipeline-jargon patterns in `SEO_PIPELINE_PATTERNS`.
 *
 * @param value - Text to inspect
 * @returns `true` when at least one leaky token is present; `false` for
 *   empty input or clean text
 */
export function hasLeakySeoToken(value: string): boolean {
  if (!value) return false;
  // Plain substring check: catches the phrase wherever it is embedded.
  if (value.toLowerCase().includes('analysis run')) return true;
  if (LEAKY_RUNID_RE.test(value)) return true;
  if (RUN_NUMBER_RE.test(value)) return true;
  return SEO_PIPELINE_PATTERNS.some((pattern) => pattern.test(value));
}
 
/**
 * Word-level strip of leaky workflow tokens from a single line of text.
 *
 * Works in two passes: first every known token family is deleted, then the
 * punctuation and whitespace left dangling by those deletions is repaired.
 *
 * @param value - Raw text that may contain workflow run tokens
 * @returns Cleaned text with all leaky run tokens removed; the empty string
 *   for empty input
 */
export function stripLeakyRunTokens(value: string): string {
  if (!value) return '';
  // Pass 1 — delete tokens. Run-id slugs go first so the generic "Run N"
  // pattern never sees the `run180` fragment inside a slug, and the full
  // "feeds were pre-fetched" phrase goes before the bare "pre-fetch" word.
  let cleaned = value
    .replace(RUNID_TOKEN_STRIP_RE, '')
    .replace(RUN_TOKEN_STRIP_RE, '')
    .replace(ANALYSIS_RUN_STRIP_RE, '')
    .replace(STAGE_TOKEN_STRIP_RE, '')
    .replace(PREFETCH_FEEDS_STRIP_RE, '')
    .replace(PREFETCH_TOKEN_STRIP_RE, '')
    .replace(SCRIPTS_PATH_STRIP_RE, '')
    .replace(HASH_NUMBER_STRIP_RE, '')
    .replace(INTERNAL_TIME_STRIP_RE, '');
  // Pass 2 — repair what the deletions left behind.
  cleaned = cleaned
    // Separators stranded right after an opening parenthesis.
    .replace(/\(\s*[,;:|/\-—–]+\s*/gu, '(')
    // Separators stranded right before a closing parenthesis.
    .replace(/\s*[,;:|/\-—–]+\s*\)/gu, ')')
    // Parentheses that are now empty.
    .replace(/\(\s*\)/gu, '')
    // Two or more adjacent separators collapse into a single ", ".
    .replace(/\s*[,;:|/]\s*[,;:|/]+\s*/gu, ', ')
    // Leading and trailing separator/whitespace runs.
    .replace(/^[\s,;:|/\-—–]+/u, '')
    .replace(/[\s,;:|/\-—–]+$/u, '')
    // Internal whitespace runs become a single space.
    .replace(/\s{2,}/gu, ' ')
    .trim();
  return cleaned;
}
 
/**
 * Clean a single-line title candidate by removing leaky workflow tokens
 * at word level.
 *
 * Only prefix/suffix-style contamination is salvaged (for example
 * `Run 180, 17 April 2026` becomes `17 April 2026`). A candidate that
 * contains the phrase `analysis run` is treated as structurally
 * contaminated — stripping tokens out of it would leave dangling
 * fragments such as `Analysis ) | …` — so it is rejected outright and the
 * empty string is returned, letting the caller fall back to a
 * summary-derived title.
 *
 * After token stripping, one leading fragment separator and any all-caps
 * document-reference prefix (`KJ-01: `, `SITUATION: `, …) are removed.
 *
 * @param value - Raw title candidate that may contain run tokens
 * @returns The sanitized title, or the empty string when the input is
 *   empty or too structurally contaminated to salvage
 */
export function sanitizeTitleCandidate(value: string): string {
  const unsalvageable = !value || /\banalysis\s+run\b/iu.test(value);
  if (unsalvageable) return '';
  const tokenFree = stripLeakyRunTokens(value);
  const withoutLeadingSeparator = stripLeadingFragmentSeparator(tokenFree);
  return withoutLeadingSeparator.replace(DOC_REF_PREFIX_RE, '');
}
 
/**
 * Pull the run number out of a run identifier.
 *
 * Accepted shapes are a bare numeric string (`"47"`), a hyphen-separated
 * id with a fused `runNN` segment (`committee-reports-run47`,
 * `breaking-run188`), or one where `run` and the number are separate
 * segments (`committee-reports-run-47`). Segments are scanned left to
 * right and the first match wins; matching is case-sensitive.
 *
 * @param runId - Raw run identifier token
 * @returns The digits of the run number as a string, or `null` when the
 *   identifier is empty or carries no run discriminator
 */
export function extractRunNumber(runId: string): string | null {
  if (!runId) return null;
  if (/^\d+$/u.test(runId)) return runId;

  const parts = runId.split('-');
  for (const [index, part] of parts.entries()) {
    // Fused form: "run47".
    const fused = /^run(\d+)$/u.exec(part);
    if (fused) return fused[1] ?? null;

    // Split form: "run" followed by an all-digit segment.
    if (part !== 'run') continue;
    const following = parts[index + 1];
    if (following && /^\d+$/u.test(following)) return following;
  }
  return null;
}
 
/**
 * Drop a single leading separator (`:`, `;`, em dash, en dash or hyphen)
 * when it is the very first character and is followed by whitespace, then
 * trim the result.
 *
 * @param value - Text that may begin with a dangling separator
 * @returns The text without that separator and without surrounding whitespace
 */
function stripLeadingFragmentSeparator(value: string): string {
  const withoutLead = value.replace(/^[:;—–-]\s+/u, '');
  return withoutLead.trim();
}
 
/**
 * Remove whole sentences that contain leaky workflow tokens.
 *
 * The text is split after `.`, `!` or `?` followed by whitespace. Sentences
 * flagged by `hasLeakySeoToken` are dropped; if that would drop every
 * sentence, the original sentences are all kept instead so the result is
 * never emptied by this step alone.
 *
 * @param value - Multi-sentence text to filter
 * @returns The surviving sentences joined by single spaces, or the empty
 *   string for empty input
 */
function stripLeakySentences(value: string): string {
  if (!value) return '';

  const sentences: string[] = [];
  for (const piece of value.split(/(?<=[.!?])\s+/u)) {
    const trimmed = piece.trim();
    if (trimmed) sentences.push(trimmed);
  }

  const kept = sentences.filter((sentence) => !hasLeakySeoToken(sentence));
  const chosen = kept.length > 0 ? kept : sentences;
  return chosen.join(' ').trim();
}
 
/**
 * Clean a description candidate: drop leaky sentences, remove a leading
 * fragment separator, and reject the result when it is empty or when
 * `shouldSkipDescriptionLine` flags it.
 *
 * @param value - Raw description text
 * @returns The cleaned description, or the empty string when nothing
 *   usable remains
 */
export function sanitizeDescriptionCandidate(value: string): string {
  const withoutLeaks = stripLeakySentences(value);
  const candidate = stripLeadingFragmentSeparator(withoutLeaks);
  if (!candidate) return '';
  if (shouldSkipDescriptionLine(candidate)) return '';
  return candidate;
}
 
/**
 * Decide whether a resolved title is fit for publication.
 *
 * After removing a leading fragment separator, the title must be at least
 * `SEO_TITLE_FLOOR` characters long, must not contain leaky workflow
 * tokens, and must not be rejected by `findTitleRejectionReason`. The one
 * tolerated rejection is `'sentence-fragment'`, and only when
 * `options.allowFullSentence` is truthy.
 *
 * @param value - Title to validate
 * @param options - Optional switches; `allowFullSentence` tolerates the
 *   `'sentence-fragment'` rejection reason
 * @returns `true` when the title passes every check
 */
export function isUsableResolvedTitle(
  value: string,
  options?: { readonly allowFullSentence?: boolean }
): boolean {
  const candidate = stripLeadingFragmentSeparator(value);
  const longEnough = candidate.length >= SEO_TITLE_FLOOR;
  if (!longEnough || hasLeakySeoToken(candidate)) return false;

  const rejection = findTitleRejectionReason(candidate);
  if (!rejection) return true;

  // A rejection only survives when the caller opted in to full sentences
  // and that is exactly what the title was rejected for.
  const sentenceAllowed = Boolean(options?.allowFullSentence);
  return sentenceAllowed && rejection === 'sentence-fragment';
}
 
/**
 * Build a headline from summary text.
 *
 * The summary is first run through `sanitizeDescriptionCandidate`. The
 * result of `extractFirstSentence` on the cleaned text is preferred, and
 * the whole cleaned text is used when that result is empty. The chosen
 * text is then passed through `truncateTitle`.
 *
 * @param summary - Summary text to derive the headline from
 * @returns The derived headline, or the empty string when the summary
 *   does not survive sanitization
 */
export function deriveHeadlineFromSummary(summary: string): string {
  const description = sanitizeDescriptionCandidate(summary);
  if (description) {
    const leadSentence = extractFirstSentence(description);
    return truncateTitle(leadSentence ? leadSentence : description);
  }
  return '';
}
 
/**
 * No-op: run numbers must never appear in user-facing article titles.
 * Preserved for callsite backward compatibility.
 *
 * The function performs no work at all: the title is handed back exactly
 * as received and the run identifier is never read.
 *
 * @param title - Base title (returned unchanged)
 * @param _runId - Raw run identifier token (ignored)
 * @returns The unchanged input title
 */
export function withRunQualifier(title: string, _runId: string): string {
  return title;
}
 
/**
 * Case-insensitive containment check after whitespace normalization.
 *
 * Both strings are lower-cased, every run of whitespace is collapsed to a
 * single space, and surrounding whitespace is trimmed. Trimming matters
 * for the needle: without it a whitespace-only needle would normalize to
 * `' '` and match any haystack that contains a space, and a needle with
 * stray leading/trailing whitespace would miss otherwise valid matches.
 *
 * @param haystack - Text to search within
 * @param needle - Substring to look for
 * @returns `true` when the normalized `needle` is non-empty and occurs
 *   within the normalized `haystack`; `false` for an empty or blank needle
 */
export function containsNormalized(haystack: string, needle: string): boolean {
  const normalize = (text: string): string => text.toLowerCase().replace(/\s+/gu, ' ').trim();
  const cleanNeedle = normalize(needle);
  // An empty or blank needle never counts as "contained".
  if (cleanNeedle.length === 0) return false;
  return normalize(haystack).includes(cleanNeedle);
}
 
/**
 * Scan candidates in order and return the first one that still has
 * content after trimming.
 *
 * Entries that are not strings at runtime are skipped rather than
 * throwing.
 *
 * @param candidates - Ordered list of candidate strings
 * @returns The trimmed first non-blank candidate, or the empty string
 *   when every entry is blank
 */
export function pickFirstNonEmpty(candidates: readonly string[]): string {
  for (const candidate of candidates) {
    if (typeof candidate !== 'string') continue;
    const trimmed = candidate.trim();
    if (trimmed.length > 0) return trimmed;
  }
  return '';
}