Add:HTML sanitizer lib to support html in podcasts and replace strip html lib

This commit is contained in:
advplyr 2022-05-27 19:41:40 -05:00
parent 96232676cb
commit c4bfa266b0
9 changed files with 1051 additions and 252 deletions

View file

@ -0,0 +1,28 @@
const sanitizeHtml = require('../libs/sanitizeHtml')
/**
 * Sanitize an HTML string down to a small whitelist of formatting tags.
 *
 * Allowed tags: p, ol, ul, li, a, strong, em. Only `href`, `name` and `target`
 * are kept on anchors, and only `https` URLs are accepted (protocol-relative
 * URLs are rejected as well).
 *
 * @param {string} html - raw, untrusted HTML (e.g. a podcast feed description)
 * @returns {string} whatever the sanitizer returns for the whitelisted markup
 */
function sanitize(html) {
  const sanitizerOptions = {
    allowedTags: [
      // 'li' must be allowed alongside 'ol'/'ul'; otherwise list item tags are
      // dropped and lists collapse into bare text inside an empty list container
      'p', 'ol', 'ul', 'li', 'a', 'strong', 'em'
    ],
    // NOTE(review): per the sanitize-html docs, 'discard' removes a disallowed
    // tag but keeps its text content - confirm the vendored lib matches
    disallowedTagsMode: 'discard',
    allowedAttributes: {
      a: ['href', 'name', 'target']
    },
    allowedSchemes: ['https'],
    allowProtocolRelative: false
  }

  return sanitizeHtml(html, sanitizerOptions)
}
module.exports.sanitize = sanitize
/**
 * Produce a plain-text version of an HTML string by running it through the
 * sanitizer with an empty tag whitelist, so no tag is allowed.
 *
 * @param {string} html - markup to strip
 * @returns {string} the sanitizer's output for `html` with no tags permitted
 */
function stripAllTags(html) {
  // An empty whitelist combined with 'discard' mode means no tag is allowed
  return sanitizeHtml(html, { allowedTags: [], disallowedTagsMode: 'discard' })
}
module.exports.stripAllTags = stripAllTags

View file

@ -1,5 +1,5 @@
const { xmlToJSON } = require('./index')
const { stripHtml } = require("string-strip-html")
const htmlSanitizer = require('./htmlSanitizer')
function parseCreators(metadata) {
if (!metadata['dc:creator']) return null
@ -57,8 +57,7 @@ function fetchDescription(metadata) {
// check if description is HTML or plain text. only plain text allowed
// calibre stores < and > as &lt; and &gt;
description = description.replace(/&lt;/g, '<').replace(/&gt;/g, '>')
if (description.match(/<!DOCTYPE html>|<\/?\s*[a-z-][^>]*\s*>|(\&(?:[\w\d]+|#\d+|#x[a-f\d]+);)/)) return stripHtml(description).result
return description
return htmlSanitizer.stripAllTags(description)
}
function fetchGenres(metadata) {

View file

@ -1,6 +1,6 @@
const Logger = require('../Logger')
const { xmlToJSON } = require('./index')
const { stripHtml } = require('string-strip-html')
const htmlSanitizer = require('../utils/htmlSanitizer')
function extractFirstArrayItem(json, key) {
if (!json[key] || !json[key].length) return null
@ -55,8 +55,9 @@ function extractPodcastMetadata(channel) {
}
if (channel['description']) {
metadata.description = extractFirstArrayItem(channel, 'description')
metadata.descriptionPlain = stripHtml(metadata.description || '').result
const rawDescription = extractFirstArrayItem(channel, 'description') || ''
metadata.description = htmlSanitizer.sanitize(rawDescription)
metadata.descriptionPlain = htmlSanitizer.stripAllTags(rawDescription)
}
var arrayFields = ['title', 'language', 'itunes:explicit', 'itunes:author', 'pubDate', 'link']
@ -81,8 +82,9 @@ function extractEpisodeData(item) {
}
if (item['description']) {
episode.description = extractFirstArrayItem(item, 'description')
episode.descriptionPlain = stripHtml(episode.description || '').result
const rawDescription = extractFirstArrayItem(item, 'description') || ''
episode.description = htmlSanitizer.sanitize(rawDescription)
episode.descriptionPlain = htmlSanitizer.stripAllTags(rawDescription)
}
var arrayFields = ['title', 'pubDate', 'itunes:episodeType', 'itunes:season', 'itunes:episode', 'itunes:author', 'itunes:duration', 'itunes:explicit', 'itunes:subtitle']