From 1b584b455106d8346f23ad7b4c7ed38fb0b646dd Mon Sep 17 00:00:00 2001 From: tillcash Date: Thu, 14 Aug 2025 18:01:47 +0530 Subject: [PATCH] [CybernewsBridge] add bridge (#4665) * [CybernewsBridge] add bridge * [CybernewsBridge] fix lint * [CybernewsBridge] add header * [CybernewsBridge] fix url * [CybernewsBridge] fix url 2 * [CybernewsBridge] revert header * [CybernewsBridge] refactor * [CybernewsBridge] final * [CybernewsBridge] lint --- bridges/CybernewsBridge.php | 114 ++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 bridges/CybernewsBridge.php diff --git a/bridges/CybernewsBridge.php b/bridges/CybernewsBridge.php new file mode 100644 index 00000000..64146f6f --- /dev/null +++ b/bridges/CybernewsBridge.php @@ -0,0 +1,114 @@ +url as $entry) { + $url = trim((string) $entry->loc); + $lastmod = trim((string) $entry->lastmod); + + if (!$url) { + continue; + } + + $pathParts = explode('/', trim(parse_url($url, PHP_URL_PATH), '/')); + $category = isset($pathParts[0]) && $pathParts[0] !== '' ? $pathParts[0] : ''; + + // Skip non-English versions + if (in_array($category, ['nl', 'de'], true)) { + continue; + } + + $namespaces = $entry->getNamespaces(true); + $title = ''; + + if (isset($namespaces['news'])) { + $news = $entry->children($namespaces['news'])->news; + + if ($news) { + $title = trim((string) $news->title); + } + } + + if (!$title) { + continue; + } + + $this->items[] = [ + 'title' => $title, + 'uri' => $url, + 'uid' => $url, + 'timestamp' => strtotime($lastmod), + 'categories' => $category ? 
[$category] : [],
+                'content' => $this->fetchFullArticle($url),
+            ];
+
+            if (count($this->items) >= self::MAX_ARTICLES) {
+                break;
+            }
+        }
+    }
+
+    /**
+     * Build the HTML body of a feed item from an article page.
+     *
+     * Loads the page through getSimpleHTMLDOMCached(), takes the first
+     * <article> element, blanks out a fixed list of unwanted child elements,
+     * promotes lazy-loading "data-src" attributes to "src" on images, and
+     * returns the inner HTML of what remains.
+     *
+     * @param string $url Address of the article page to load.
+     *
+     * @return string Inner HTML of the cleaned <article> element, or a short
+     *                plain-text notice when the page could not be loaded or
+     *                contains no <article> element.
+     */
+    private function fetchFullArticle(string $url): string
+    {
+        $dom = getSimpleHTMLDOMCached($url);
+        if (!$dom) {
+            return 'Unable to fetch article content';
+        }
+
+        $body = $dom->find('article', 0);
+        if (!$body) {
+            return 'Unable to parse article content';
+        }
+
+        // Elements that should not appear in the feed content.
+        $unwanted = [
+            'script',
+            'style',
+            'div.links-bar',
+            'div.google-news-cta',
+            'div.a-wrapper',
+            'div.embed_youtube',
+        ];
+
+        foreach ($unwanted as $cssSelector) {
+            $matches = $body->find($cssSelector);
+
+            foreach ($matches as $node) {
+                // An empty outer text drops the node from the rendered output.
+                $node->outertext = '';
+            }
+        }
+
+        // Lazy-loaded images keep their real address in "data-src"; move it
+        // into "src" and discard the now-redundant attribute.
+        foreach ($body->find('img') as $image) {
+            if (empty($image->{'data-src'})) {
+                continue;
+            }
+
+            $image->src = $image->{'data-src'};
+            unset($image->{'data-src'});
+        }
+
+        return $body->innertext;
+    }
+}