Add:HTML sanitizer lib to support html in podcasts and replace strip html lib

2025-08-29 22:29:31 +02:00 · 2022-05-27 19:41:40 -05:00 · 2022-05-27 19:41:40 -05:00 · c4bfa266b0
commit c4bfa266b0
parent 96232676cb
9 changed files with 1051 additions and 252 deletions
--- a/server/utils/htmlSanitizer.js
+++ b/server/utils/htmlSanitizer.js
@ -0,0 +1,28 @@
+const sanitizeHtml = require('../libs/sanitizeHtml')
+
+/**
+ * Sanitize HTML to a small allow-list of tags; disallowed tags are discarded.
+ * 'li' is allowed alongside 'ol'/'ul' so list items keep their markup.
+ * @param {string} html - raw, untrusted HTML (e.g. a podcast description)
+ * @returns {string} output of sanitizeHtml under the options below
+ */
+function sanitize(html) {
+  return sanitizeHtml(html, {
+    allowedTags: ['p', 'ol', 'ul', 'li', 'a', 'strong', 'em'],
+    disallowedTagsMode: 'discard',
+    allowedAttributes: { a: ['href', 'name', 'target'] },
+    allowedSchemes: ['https'],
+    allowProtocolRelative: false
+  })
+}
+module.exports.sanitize = sanitize
+
+/**
+ * Run the sanitizer with an empty tag allow-list so no HTML tags are kept.
+ * @param {string} html - markup or plain text to strip
+ * @returns {string} output of sanitizeHtml with every tag disallowed
+ */
+function stripAllTags(html) {
+  return sanitizeHtml(html, { allowedTags: [], disallowedTagsMode: 'discard' })
+}
+module.exports.stripAllTags = stripAllTags
--- a/server/utils/parseOpfMetadata.js
+++ b/server/utils/parseOpfMetadata.js
@ -1,5 +1,5 @@
 const { xmlToJSON } = require('./index')
-const { stripHtml } = require("string-strip-html")
+const htmlSanitizer = require('./htmlSanitizer')

 function parseCreators(metadata) {
  if (!metadata['dc:creator']) return null
@ -57,8 +57,7 @@ function fetchDescription(metadata) {
  // check if description is HTML or plain text. only plain text allowed
  // calibre stores < and > as &lt; and &gt;
  description = description.replace(/&lt;/g, '<').replace(/&gt;/g, '>')
-  if (description.match(/<!DOCTYPE html>|<\/?\s*[a-z-][^>]*\s*>|(\&(?:[\w\d]+|#\d+|#x[a-f\d]+);)/)) return stripHtml(description).result
-  return description
+  return htmlSanitizer.stripAllTags(description)
 }

 function fetchGenres(metadata) {
--- a/server/utils/podcastUtils.js
+++ b/server/utils/podcastUtils.js
@ -1,6 +1,6 @@
 const Logger = require('../Logger')
 const { xmlToJSON } = require('./index')
-const { stripHtml } = require('string-strip-html')
+const htmlSanitizer = require('../utils/htmlSanitizer')

 function extractFirstArrayItem(json, key) {
  if (!json[key] || !json[key].length) return null
@ -55,8 +55,9 @@ function extractPodcastMetadata(channel) {
  }

  if (channel['description']) {
-    metadata.description = extractFirstArrayItem(channel, 'description')
-    metadata.descriptionPlain = stripHtml(metadata.description || '').result
+    const rawDescription = extractFirstArrayItem(channel, 'description') || ''
+    metadata.description = htmlSanitizer.sanitize(rawDescription)
+    metadata.descriptionPlain = htmlSanitizer.stripAllTags(rawDescription)
  }

  var arrayFields = ['title', 'language', 'itunes:explicit', 'itunes:author', 'pubDate', 'link']
@ -81,8 +82,9 @@ function extractEpisodeData(item) {
  }

  if (item['description']) {
-    episode.description = extractFirstArrayItem(item, 'description')
-    episode.descriptionPlain = stripHtml(episode.description || '').result
+    const rawDescription = extractFirstArrayItem(item, 'description') || ''
+    episode.description = htmlSanitizer.sanitize(rawDescription)
+    episode.descriptionPlain = htmlSanitizer.stripAllTags(rawDescription)
  }

  var arrayFields = ['title', 'pubDate', 'itunes:episodeType', 'itunes:season', 'itunes:episode', 'itunes:author', 'itunes:duration', 'itunes:explicit', 'itunes:subtitle']