Add:Epub metadata parser and cover extractor #1479

2025-08-03 01:35:08 +02:00 · 2024-01-07 17:51:07 -06:00 · 2024-01-07 17:51:07 -06:00 · 69e23ef9f2
commit 69e23ef9f2
parent 48a08e9659
8 changed files with 284 additions and 35 deletions
--- a/server/utils/parsers/parseEbookMetadata.js
+++ b/server/utils/parsers/parseEbookMetadata.js
@ -0,0 +1,42 @@
+const parseEpubMetadata = require('./parseEpubMetadata')
+
+/**
+ * @typedef EBookFileScanData
+ * @property {string} path
+ * @property {string} ebookFormat
+ * @property {string} [ebookCoverPath] internal image path, only set when a cover image was found
+ * @property {import('../../scanner/BookScanner').BookMetadataObject} metadata
+ */
+
+/**
+ * Read metadata out of an ebook file by dispatching on its format.
+ * Only the epub format is handled; everything else yields null.
+ * 
+ * @param {import('../../models/Book').EBookFileObject} ebookFile 
+ * @returns {Promise<EBookFileScanData>} resolves to null when no file is given or the format is not epub
+ */
+async function parse(ebookFile) {
+  // A missing ebookFile produces an undefined format and falls through to the default case
+  switch (ebookFile?.ebookFormat) {
+    case 'epub':
+      return parseEpubMetadata.parse(ebookFile.metadata.path)
+    default:
+      return null
+  }
+}
+module.exports.parse = parse
+
+/**
+ * Write the cover image referenced by previously parsed scan data to disk.
+ * Delegates to the epub parser; other formats are not handled.
+ * 
+ * @param {EBookFileScanData} ebookFileScanData 
+ * @param {string} outputCoverPath 
+ * @returns {Promise<boolean>} false when there is no cover path or the format is not epub, otherwise the epub extractor result
+ */
+async function extractCoverImage(ebookFileScanData, outputCoverPath) {
+  const internalCoverPath = ebookFileScanData?.ebookCoverPath
+  const isEpubWithCover = Boolean(internalCoverPath) && ebookFileScanData.ebookFormat === 'epub'
+  if (!isEpubWithCover) return false
+
+  return parseEpubMetadata.extractCoverImage(ebookFileScanData.path, internalCoverPath, outputCoverPath)
+}
+module.exports.extractCoverImage = extractCoverImage
--- a/server/utils/parsers/parseEpubMetadata.js
+++ b/server/utils/parsers/parseEpubMetadata.js
@ -0,0 +1,109 @@
+const Path = require('path')
+const Logger = require('../../Logger')
+const StreamZip = require('../../libs/nodeStreamZip')
+const parseOpfMetadata = require('./parseOpfMetadata')
+const { xmlToJSON } = require('../index')
+
+
+/**
+ * Read a single entry out of the epub archive and decode it as UTF-8 text.
+ * A failed read is logged and results in undefined rather than a rejection.
+ * 
+ * @param {string} epubPath 
+ * @param {string} filepath path of the entry inside the epub
+ * @returns {Promise<string>} entry content, or undefined when the entry could not be read
+ */
+async function extractFileFromEpub(epubPath, filepath) {
+  const archive = new StreamZip.async({ file: epubPath })
+
+  let entryBuffer
+  try {
+    entryBuffer = await archive.entryData(filepath)
+  } catch (error) {
+    Logger.error(`[parseEpubMetadata] Failed to extract ${filepath} from epub at "${epubPath}"`, error)
+  }
+
+  // The archive is closed whether or not the entry was read
+  await archive.close()
+  return entryBuffer?.toString('utf8')
+}
+
+/**
+ * Read an XML entry from the epub and convert it with xmlToJSON.
+ * 
+ * @param {string} epubPath 
+ * @param {string} xmlFilepath path of the XML entry inside the epub
+ * @returns {Promise<Object>} converted document, or null when the entry is missing or empty
+ */
+async function extractXmlToJson(epubPath, xmlFilepath) {
+  const xmlText = await extractFileFromEpub(epubPath, xmlFilepath)
+  return xmlText ? xmlToJSON(xmlText) : null
+}
+
+/**
+ * Copy one image entry of the epub archive to a file on disk.
+ * Extraction failures are logged and reported through the return value.
+ * 
+ * @param {string} epubPath 
+ * @param {string} epubImageFilepath path of the image inside the epub
+ * @param {string} outputCoverPath destination file path
+ * @returns {Promise<boolean>} true when the image was extracted, false otherwise
+ */
+async function extractCoverImage(epubPath, epubImageFilepath, outputCoverPath) {
+  const archive = new StreamZip.async({ file: epubPath })
+
+  let extracted = true
+  try {
+    await archive.extract(epubImageFilepath, outputCoverPath)
+  } catch (error) {
+    Logger.error(`[parseEpubMetadata] Failed to extract image ${epubImageFilepath} from epub at "${epubPath}"`, error)
+    extracted = false
+  }
+
+  // The archive is closed after both the success and the failure path
+  await archive.close()
+
+  return extracted
+}
+module.exports.extractCoverImage = extractCoverImage
+
+/**
+ * Pick the manifest item to use as the cover image.
+ * Preference order: an image item flagged with the EPUB 3 "cover-image" property,
+ * then the image item referenced by an EPUB 2 <meta name="cover" content="id">,
+ * then the first manifest item with an image media type.
+ * 
+ * @param {Object} packageJson parsed package document (child elements as arrays, attributes under `$`)
+ * @returns {Object|undefined} manifest item, or undefined when the manifest lists no image
+ */
+function findCoverManifestItem(packageJson) {
+  const manifestItems = packageJson.package?.manifest?.[0]?.item
+  if (!Array.isArray(manifestItems)) return undefined
+
+  const isImage = (item) => !!item?.$?.['media-type']?.startsWith('image/')
+
+  // "properties" is a space separated list, so match the individual token
+  const flaggedCover = manifestItems.find((item) => isImage(item) && String(item.$.properties || '').split(/\s+/).includes('cover-image'))
+  if (flaggedCover) return flaggedCover
+
+  const metaCoverId = packageJson.package?.metadata?.[0]?.meta?.find?.((meta) => meta?.$?.name === 'cover')?.$?.content
+  const referencedCover = metaCoverId ? manifestItems.find((item) => isImage(item) && item.$.id === metaCoverId) : undefined
+
+  return referencedCover || manifestItems.find(isImage)
+}
+
+/**
+ * Parse metadata from epub
+ * 
+ * Reads META-INF/container.xml to locate the package document (opf file), parses the
+ * metadata from it and looks up the cover image path in the manifest.
+ * 
+ * @param {string} epubPath 
+ * @returns {Promise<import('./parseEbookMetadata').EBookFileScanData>} resolves to null when the container,
+ *   the package document or its metadata cannot be read; `ebookCoverPath` is only set when a cover image was found
+ */
+async function parse(epubPath) {
+  Logger.debug(`Parsing metadata from epub at "${epubPath}"`)
+  // Entrypoint of the epub that contains the filepath to the package document (opf file)
+  const containerJson = await extractXmlToJson(epubPath, 'META-INF/container.xml')
+
+  // Get package document opf filepath from container.xml
+  // containerJson is null when container.xml could not be read, so the first access is guarded as well
+  const packageDocPath = containerJson?.container?.rootfiles?.[0]?.rootfile?.[0]?.$?.['full-path']
+  if (!packageDocPath) {
+    Logger.error(`Failed to get package doc path in Container.xml`, JSON.stringify(containerJson, null, 2))
+    return null
+  }
+
+  // Extract package document to JSON
+  const packageJson = await extractXmlToJson(epubPath, packageDocPath)
+  if (!packageJson) {
+    return null
+  }
+
+  // Parse metadata from package document opf file
+  const opfMetadata = parseOpfMetadata.parseOpfMetadataJson(packageJson)
+  if (!opfMetadata) {
+    Logger.error(`Unable to parse metadata in package doc with json`, JSON.stringify(packageJson, null, 2))
+    return null
+  }
+
+  const payload = {
+    path: epubPath,
+    ebookFormat: 'epub',
+    metadata: opfMetadata
+  }
+
+  // Attempt to find filepath to cover image
+  const coverImagePath = findCoverManifestItem(packageJson)?.$?.href
+  if (coverImagePath) {
+    // Manifest hrefs are relative to the package document; paths inside the zip always use forward slashes
+    const packageDirname = Path.posix.dirname(packageDocPath)
+    payload.ebookCoverPath = Path.posix.join(packageDirname, coverImagePath)
+  } else {
+    Logger.warn(`Cover image not found in manifest for epub at "${epubPath}"`)
+  }
+
+  return payload
+}
+module.exports.parse = parse
--- a/server/utils/parsers/parseOpfMetadata.js
+++ b/server/utils/parsers/parseOpfMetadata.js
@ -136,11 +136,7 @@ function stripPrefix(str) {
  return str.split(':').pop()
 }

-module.exports.parseOpfMetadataXML = async (xml) => {
-  const json = await xmlToJSON(xml)
-
-  if (!json) return null
-
+module.exports.parseOpfMetadataJson = (json) => {
  // Handle <package ...> or with prefix <ns0:package ...>
  const packageKey = Object.keys(json).find(key => stripPrefix(key) === 'package')
  if (!packageKey) return null
@ -167,7 +163,7 @@ module.exports.parseOpfMetadataXML = async (xml) => {
  const creators = parseCreators(metadata)
  const authors = (fetchCreators(creators, 'aut') || []).map(au => au?.trim()).filter(au => au)
  const narrators = (fetchNarrators(creators, metadata) || []).map(nrt => nrt?.trim()).filter(nrt => nrt)
-  const data = {
+  return {
    title: fetchTitle(metadata),
    subtitle: fetchSubtitle(metadata),
    authors,
@ -182,5 +178,10 @@ module.exports.parseOpfMetadataXML = async (xml) => {
    series: fetchSeries(metadataMeta),
    tags: fetchTags(metadata)
  }
-  return data
+}
+
+/**
+ * Parse OPF metadata from an XML string.
+ * Converts the XML with xmlToJSON and hands the result to parseOpfMetadataJson.
+ * 
+ * @param {string} xml 
+ * @returns {Promise<Object>} parsed metadata, or null when the XML conversion yields nothing
+ */
+module.exports.parseOpfMetadataXML = async (xml) => {
+  const json = await xmlToJSON(xml)
+  if (!json) return null
+  // Arrow functions take `this` from module scope; reference the export explicitly instead of relying on that
+  return module.exports.parseOpfMetadataJson(json)
 }