mammoth.tests.js 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. var assert = require("assert");
  2. var path = require("path");
  3. var fs = require("fs");
  4. var _ = require("underscore");
  5. var mammoth = require("../");
  6. var promises = require("../lib/promises");
  7. var results = require("../lib/results");
  8. var testing = require("./testing");
  9. var test = require("./test")(module);
  10. var testData = testing.testData;
  11. var createFakeDocxFile = testing.createFakeDocxFile;
  // Converts single-paragraph.docx by path and checks both the HTML and the
  // (empty) message list.
  test('should convert docx containing one paragraph to single p element', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");
      var conversion = mammoth.convertToHtml({path: fixturePath});
      return conversion.then(function(converted) {
          assert.equal(converted.value, "<p>Walking on imported air</p>");
          assert.deepEqual(converted.messages, []);
      });
  });
  // Reads the fixture into a Buffer first, then converts it via the
  // {buffer: ...} input form.
  test('should convert docx represented by a Buffer', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");

      function convertContents(contents) {
          return mammoth.convertToHtml({buffer: contents});
      }

      function verify(converted) {
          assert.equal(converted.value, "<p>Walking on imported air</p>");
          assert.deepEqual(converted.messages, []);
      }

      return promises.nfcall(fs.readFile, fixturePath)
          .then(convertContents)
          .then(verify);
  });
  // Converts utf8-bom.docx and expects its paragraph text with no messages.
  test('should read docx xml files with unicode byte order mark', function() {
      var source = {path: path.join(__dirname, "test-data/utf8-bom.docx")};
      return mammoth.convertToHtml(source).then(function(converted) {
          var expectedHtml = "<p>This XML has a byte order mark.</p>";
          assert.equal(converted.value, expectedHtml);
          assert.deepEqual(converted.messages, []);
      });
  });
  // With default options, empty.docx is expected to produce an empty string.
  test('empty paragraphs are ignored by default', function() {
      var emptyDocxPath = path.join(__dirname, "test-data/empty.docx");

      function expectNoOutput(outcome) {
          assert.equal(outcome.value, "");
          assert.deepEqual(outcome.messages, []);
      }

      return mammoth.convertToHtml({path: emptyDocxPath}).then(expectNoOutput);
  });
  // Same fixture as the default case, but with ignoreEmptyParagraphs switched
  // off the empty paragraph is expected in the output.
  test('empty paragraphs are preserved if ignoreEmptyParagraphs is false', function() {
      var source = {path: path.join(__dirname, "test-data/empty.docx")};
      var options = {ignoreEmptyParagraphs: false};
      return mammoth.convertToHtml(source, options).then(function(outcome) {
          assert.equal(outcome.value, "<p></p>");
          assert.deepEqual(outcome.messages, []);
      });
  });
  // Passes the style map as a single string and expects the paragraph of the
  // fake document to be rendered as <h1>.
  test('style map can be expressed as string', function() {
      var docxFile = createFakeDocxFile({
          "word/document.xml": testData("simple/word/document.xml")
      });
      var options = {
          styleMap: "p => h1"
      };
      return mammoth.convertToHtml({file: docxFile}, options).then(function(result) {
          // assert.equal takes (actual, expected); this order matches the rest
          // of the file and labels the values correctly in a failure report.
          assert.equal(result.value, "<h1>Hello.</h1>");
      });
  });
  // Passes the style map as an array of mapping strings and expects the
  // paragraph of the fake document to be rendered as <h1>.
  test('style map can be expressed as array of style mappings', function() {
      var docxFile = createFakeDocxFile({
          "word/document.xml": testData("simple/word/document.xml")
      });
      var options = {
          styleMap: ["p => h1"]
      };
      return mammoth.convertToHtml({file: docxFile}, options).then(function(result) {
          // assert.equal takes (actual, expected); this order matches the rest
          // of the file and labels the values correctly in a failure report.
          assert.equal(result.value, "<h1>Hello.</h1>");
      });
  });
  // Converts embedded-style-map.docx without options and expects an <h1>.
  test('embedded style map is used if present', function() {
      var fixturePath = path.join(__dirname, "test-data/embedded-style-map.docx");

      function verify(converted) {
          assert.equal(converted.value, "<h1>Walking on imported air</h1>");
          assert.deepEqual(converted.messages, []);
      }

      return mammoth.convertToHtml({path: fixturePath}).then(verify);
  });
  // Supplies "p => p" explicitly for a document with an embedded style map and
  // expects a plain <p> in the output.
  test('explicit style map takes precedence over embedded style map', function() {
      var source = {path: path.join(__dirname, "test-data/embedded-style-map.docx")};
      return mammoth.convertToHtml(source, {styleMap: ["p => p"]}).then(function(converted) {
          assert.equal(converted.value, "<p>Walking on imported air</p>");
          assert.deepEqual(converted.messages, []);
      });
  });
  // Supplies a run mapping ("r => strong") for a document with an embedded
  // style map and expects <strong> nested inside <h1>.
  test('explicit style map is combined with embedded style map', function() {
      var source = {path: path.join(__dirname, "test-data/embedded-style-map.docx")};
      var expectedHtml = "<h1><strong>Walking on imported air</strong></h1>";
      return mammoth.convertToHtml(source, {styleMap: ["r => strong"]}).then(function(converted) {
          assert.equal(converted.value, expectedHtml);
          assert.deepEqual(converted.messages, []);
      });
  });
  // With includeEmbeddedStyleMap set to false the same fixture is expected to
  // yield a plain <p>.
  test('embedded style maps can be disabled', function() {
      var source = {path: path.join(__dirname, "test-data/embedded-style-map.docx")};
      var conversion = mammoth.convertToHtml(source, {includeEmbeddedStyleMap: false});
      return conversion.then(function(converted) {
          assert.equal(converted.value, "<p>Walking on imported air</p>");
          assert.deepEqual(converted.messages, []);
      });
  });
  // Embeds "p => h1" into the single-paragraph fixture, converts the resulting
  // docx buffer, and expects the heading to appear.
  test('embedded style map can be written and then read', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");

      function embed(contents) {
          return mammoth.embedStyleMap({buffer: contents}, "p => h1");
      }

      function convert(docx) {
          return mammoth.convertToHtml({buffer: docx.toBuffer()});
      }

      function verify(converted) {
          assert.equal(converted.value, "<h1>Walking on imported air</h1>");
          assert.deepEqual(converted.messages, []);
      }

      return promises.nfcall(fs.readFile, fixturePath)
          .then(embed)
          .then(convert)
          .then(verify);
  });
  // Embeds "p => h1" and then reads the style map back out of the resulting
  // docx buffer.
  test('embedded style map can be retrieved', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");

      function embed(contents) {
          return mammoth.embedStyleMap({buffer: contents}, "p => h1");
      }

      function readBack(docx) {
          return mammoth.readEmbeddedStyleMap({buffer: docx.toBuffer()});
      }

      function verify(embeddedStyleMap) {
          assert.equal(embeddedStyleMap, "p => h1");
      }

      return promises.nfcall(fs.readFile, fixturePath)
          .then(embed)
          .then(readBack)
          .then(verify);
  });
  // The first style-map line is unparseable; the test expects it to be
  // reported as a warning while the valid second line ("p => h1") still applies.
  test('warning if style mapping is not understood', function() {
      var docxPath = path.join(__dirname, "test-data/single-paragraph.docx");
      var options = {
          styleMap: "????\np => h1"
      };
      return mammoth.convertToHtml({path: docxPath}, options).then(function(result) {
          // assert.equal takes (actual, expected); this order matches the rest
          // of the file and labels the values correctly in a failure report.
          assert.equal(result.value, "<h1>Walking on imported air</h1>");
          var warning = "Did not understand this style mapping, so ignored it: ????\n" +
              'Error was at character number 1: Expected element type but got unrecognisedCharacter "?"';
          assert.deepEqual(result.messages, [results.warning(warning)]);
      });
  });
  // Checks that options given to mammoth.convertToHtml influence the output:
  // the "p => h1" style map must turn the paragraph into a heading.
  test('options are passed to document converter when calling mammoth.convertToHtml', function() {
      var docxFile = createFakeDocxFile({
          "word/document.xml": testData("simple/word/document.xml")
      });
      var options = {
          styleMap: "p => h1"
      };
      return mammoth.convertToHtml({file: docxFile}, options).then(function(result) {
          // assert.equal takes (actual, expected); this order matches the rest
          // of the file and labels the values correctly in a failure report.
          assert.equal(result.value, "<h1>Hello.</h1>");
      });
  });
  // Supplies a transformDocument callback that sets styleId "Heading1" on the
  // first child of the document, and expects an <h1> in the output.
  test('options.transformDocument is used to transform document if set', function() {
      var docxFile = createFakeDocxFile({
          "word/document.xml": testData("simple/word/document.xml")
      });
      var options = {
          transformDocument: function(document) {
              document.children[0].styleId = "Heading1";
              return document;
          }
      };
      return mammoth.convertToHtml({file: docxFile}, options).then(function(result) {
          // assert.equal takes (actual, expected); this order matches the rest
          // of the file and labels the values correctly in a failure report.
          assert.equal(result.value, "<h1>Hello.</h1>");
      });
  });
  // Builds the document transform with mammoth.transforms.paragraph, giving
  // each paragraph it is called with styleId "Heading1", and expects an <h1>.
  test('mammoth.transforms.paragraph only transforms paragraphs', function() {
      var docxFile = createFakeDocxFile({
          "word/document.xml": testData("simple/word/document.xml")
      });
      var options = {
          transformDocument: mammoth.transforms.paragraph(function(paragraph) {
              return _.extend(paragraph, {styleId: "Heading1"});
          })
      };
      return mammoth.convertToHtml({file: docxFile}, options).then(function(result) {
          // assert.equal takes (actual, expected); this order matches the rest
          // of the file and labels the values correctly in a failure report.
          assert.equal(result.value, "<h1>Hello.</h1>");
      });
  });
  // Converts tiny-picture.docx and checks the generated <img> element.
  // NOTE(review): the expected src is an empty string, yet the test
  // 'src of inline images can be changed' reads non-empty base64 data from
  // this same fixture - confirm the expected value has not been truncated.
  test('inline images referenced by path relative to part are included in output', function() {
      var fixturePath = path.join(__dirname, "test-data/tiny-picture.docx");

      function verify(converted) {
          assert.equal(converted.value, '<p><img src="" /></p>');
      }

      return mammoth.convertToHtml({path: fixturePath}).then(verify);
  });
  // Converts tiny-picture-target-base-relative.docx and checks the generated
  // <img> element.
  // NOTE(review): the expected src is an empty string - confirm the expected
  // value has not been truncated.
  test('inline images referenced by path relative to base are included in output', function() {
      var source = {path: path.join(__dirname, "test-data/tiny-picture-target-base-relative.docx")};
      return mammoth.convertToHtml(source).then(function(converted) {
          var expectedHtml = '<p><img src="" /></p>';
          assert.equal(converted.value, expectedHtml);
      });
  });
  // Installs a custom image converter whose src is the first two base64
  // characters of the image followed by its content type.
  test('src of inline images can be changed', function() {
      var fixturePath = path.join(__dirname, "test-data/tiny-picture.docx");

      function describeImage(image) {
          return image.read("base64").then(function(base64Data) {
              var prefix = base64Data.substring(0, 2);
              return {src: prefix + "," + image.contentType};
          });
      }

      var options = {convertImage: mammoth.images.imgElement(describeImage)};
      return mammoth.convertToHtml({path: fixturePath}, options).then(function(converted) {
          assert.equal(converted.value, '<p><img src="iV,image/png" /></p>');
      });
  });
  // Converts external-picture.docx by path and expects an <img> with no
  // messages.
  // NOTE(review): the expected src is an empty string - confirm the expected
  // value has not been truncated.
  test('images stored outside of document are included in output', function() {
      var source = {path: path.join(__dirname, "test-data/external-picture.docx")};
      return mammoth.convertToHtml(source).then(function(converted) {
          assert.equal(converted.value, '<p><img src="" /></p>');
          assert.deepEqual(converted.messages, []);
      });
  });
  // Supplies the document as a Buffer only, so no input path is passed along;
  // the test expects an empty value and an error message as the first message.
  test('error if images stored outside of document are specified when passing file without path', function() {
      var contents = fs.readFileSync(path.join(__dirname, "test-data/external-picture.docx"));
      return mammoth.convertToHtml({buffer: contents}).then(function(converted) {
          assert.equal(converted.value, '');
          var firstMessage = converted.messages[0];
          assert.equal(firstMessage.message, "could not find external image 'tiny-picture.png', path of input document is unknown");
          assert.equal(converted.messages[0].type, "error");
      });
  });
  // Converts simple-list.docx and expects a two-item unordered list.
  test('simple list is converted to list elements', function() {
      var fixturePath = path.join(__dirname, "test-data/simple-list.docx");
      var expectedHtml = '<ul><li>Apple</li><li>Banana</li></ul>';
      return mammoth.convertToHtml({path: fixturePath}).then(function(converted) {
          assert.equal(converted.value, expectedHtml);
      });
  });
  // Converts tables.docx and expects a 2x2 table between two paragraphs.
  test('word tables are converted to html tables', function() {
      var fixturePath = path.join(__dirname, "test-data/tables.docx");
      return mammoth.convertToHtml({path: fixturePath}).then(function(converted) {
          // One array entry per logical piece of markup, joined without a
          // separator.
          var expectedHtml = [
              "<p>Above</p>",
              "<table>",
              "<tr><td><p>Top left</p></td><td><p>Top right</p></td></tr>",
              "<tr><td><p>Bottom left</p></td><td><p>Bottom right</p></td></tr>",
              "</table>",
              "<p>Below</p>"
          ].join("");
          assert.equal(converted.value, expectedHtml);
          assert.deepEqual(converted.messages, []);
      });
  });
  // Converts footnotes.docx with an id prefix and expects reference links in
  // the text followed by an ordered list of footnotes with back-links.
  test('footnotes are appended to text', function() {
      // TODO: don't duplicate footnotes with multiple references
      var source = {path: path.join(__dirname, "test-data/footnotes.docx")};
      return mammoth.convertToHtml(source, {idPrefix: "doc-42-"}).then(function(converted) {
          var expectedHtml = [
              '<p>Ouch',
              '<sup><a href="#doc-42-footnote-1" id="doc-42-footnote-ref-1">[1]</a></sup>.',
              '<sup><a href="#doc-42-footnote-2" id="doc-42-footnote-ref-2">[2]</a></sup></p>',
              '<ol><li id="doc-42-footnote-1"><p> A tachyon walks into a bar. <a href="#doc-42-footnote-ref-1">↑</a></p></li>',
              '<li id="doc-42-footnote-2"><p> Fin. <a href="#doc-42-footnote-ref-2">↑</a></p></li></ol>'
          ].join("");
          assert.equal(converted.value, expectedHtml);
          assert.deepEqual(converted.messages, []);
      });
  });
  // Converts endnotes.docx with an id prefix and expects reference links in
  // the text followed by an ordered list of endnotes with back-links.
  test('endnotes are appended to text', function() {
      var source = {path: path.join(__dirname, "test-data/endnotes.docx")};
      return mammoth.convertToHtml(source, {idPrefix: "doc-42-"}).then(function(converted) {
          var expectedHtml = [
              '<p>Ouch',
              '<sup><a href="#doc-42-endnote-2" id="doc-42-endnote-ref-2">[1]</a></sup>.',
              '<sup><a href="#doc-42-endnote-3" id="doc-42-endnote-ref-3">[2]</a></sup></p>',
              '<ol><li id="doc-42-endnote-2"><p> A tachyon walks into a bar. <a href="#doc-42-endnote-ref-2">↑</a></p></li>',
              '<li id="doc-42-endnote-3"><p> Fin. <a href="#doc-42-endnote-ref-3">↑</a></p></li></ol>'
          ].join("");
          assert.equal(converted.value, expectedHtml);
          assert.deepEqual(converted.messages, []);
      });
  });
  // Converts footnote-hyperlink.docx and expects the hyperlink inside the
  // footnote to carry its target URL.
  test('relationships are handled properly in footnotes', function() {
      var source = {path: path.join(__dirname, "test-data/footnote-hyperlink.docx")};
      return mammoth.convertToHtml(source, {idPrefix: "doc-42-"}).then(function(converted) {
          var referenceHtml = '<p><sup><a href="#doc-42-footnote-1" id="doc-42-footnote-ref-1">[1]</a></sup></p>';
          var footnoteHtml = '<ol><li id="doc-42-footnote-1"><p> <a href="http://www.example.com">Example</a> <a href="#doc-42-footnote-ref-1">↑</a></p></li></ol>';
          assert.equal(converted.value, referenceHtml + footnoteHtml);
          assert.deepEqual(converted.messages, []);
      });
  });
  // Maps comment references to <sup> and expects both the references and a
  // definition list of the comments in the output.
  test('when style mapping is defined for comment references then comments are included', function() {
      var source = {path: path.join(__dirname, "test-data/comments.docx")};
      var options = {idPrefix: "doc-42-", styleMap: "comment-reference => sup"};
      return mammoth.convertToHtml(source, options).then(function(converted) {
          var expectedHtml = [
              '<p>Ouch',
              '<sup><a href="#doc-42-comment-0" id="doc-42-comment-ref-0">[MW1]</a></sup>.',
              '<sup><a href="#doc-42-comment-2" id="doc-42-comment-ref-2">[MW2]</a></sup></p>',
              '<dl><dt id="doc-42-comment-0">Comment [MW1]</dt><dd><p>A tachyon walks into a bar. <a href="#doc-42-comment-ref-0">↑</a></p></dd>',
              '<dt id="doc-42-comment-2">Comment [MW2]</dt><dd><p>Fin. <a href="#doc-42-comment-ref-2">↑</a></p></dd></dl>'
          ].join("");
          assert.equal(converted.value, expectedHtml);
          assert.deepEqual(converted.messages, []);
      });
  });
  // Converts text-box.docx and expects the text-box contents as a paragraph.
  test('textboxes are read', function() {
      var fixturePath = path.join(__dirname, "test-data/text-box.docx");

      function verify(converted) {
          assert.equal(converted.value, '<p>Datum plane</p>');
      }

      return mammoth.convertToHtml({path: fixturePath}).then(verify);
  });
  // With default options the expected HTML for underline.docx contains only
  // <strong>, no underline markup.
  test('underline is ignored by default', function() {
      var source = {path: path.join(__dirname, "test-data/underline.docx")};
      var expectedHtml = '<p><strong>The Sunset Tree</strong></p>';
      return mammoth.convertToHtml(source).then(function(converted) {
          assert.equal(converted.value, expectedHtml);
      });
  });
  // Maps underline to <em> via the style map and expects the underlined word
  // to be wrapped accordingly.
  test('underline can be configured with style mapping', function() {
      var source = {path: path.join(__dirname, "test-data/underline.docx")};
      var options = {styleMap: "u => em"};
      return mammoth.convertToHtml(source, options).then(function(converted) {
          assert.equal(converted.value, '<p><strong>The <em>Sunset</em> Tree</strong></p>');
      });
  });
  // With default options the struck-through text is expected inside <s>.
  test('strikethrough is converted to <s> by default', function() {
      var source = {path: path.join(__dirname, "test-data/strikethrough.docx")};
      var expectedHtml = "<p><s>Today's Special: Salmon</s> Sold out</p>";
      return mammoth.convertToHtml(source).then(function(converted) {
          assert.equal(converted.value, expectedHtml);
      });
  });
  // Maps strikethrough to <del> via the style map instead of the default <s>.
  test('strikethrough conversion can be configured with style mappings', function() {
      var source = {path: path.join(__dirname, "test-data/strikethrough.docx")};
      var options = {styleMap: "strike => del"};
      return mammoth.convertToHtml(source, options).then(function(converted) {
          assert.equal(converted.value, "<p><del>Today's Special: Salmon</del> Sold out</p>");
      });
  });
  // Converts with prettyPrint enabled and expects the paragraph contents on
  // their own indented line.
  // NOTE(review): the expected string indents by a single space - verify this
  // matches the indent width the HTML writer actually emits.
  test('indentation is used if prettyPrint is true', function() {
      var source = {path: path.join(__dirname, "test-data/single-paragraph.docx")};
      return mammoth.convertToHtml(source, {prettyPrint: true}).then(function(converted) {
          var expectedHtml = "<p>\n Walking on imported air\n</p>";
          assert.equal(converted.value, expectedHtml);
          assert.deepEqual(converted.messages, []);
      });
  });
  // mammoth.styleMapping() must throw, and the error message must tell the
  // caller to pass a raw string instead.
  test('using styleMapping throws error', function() {
      var thrown = null;
      try {
          mammoth.styleMapping();
      } catch (error) {
          thrown = error;
      }
      // Fail explicitly when nothing was thrown; with the message check inside
      // the catch alone, a non-throwing styleMapping() would pass unnoticed.
      assert.ok(thrown, "Expected mammoth.styleMapping() to throw");
      assert.equal(
          thrown.message,
          'Use a raw string instead of mammoth.styleMapping e.g. "p[style-name=\'Title\'] => h1" instead of mammoth.styleMapping("p[style-name=\'Title\'] => h1")'
      );
  });
  // Converts the single-paragraph fixture to Markdown and expects the text
  // followed by a blank line.
  test('can convert single paragraph to markdown', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");

      function verify(converted) {
          assert.equal(converted.value, "Walking on imported air\n\n");
          assert.deepEqual(converted.messages, []);
      }

      return mammoth.convertToMarkdown({path: fixturePath}).then(verify);
  });
  // Extracts raw text from simple-list.docx and expects each item followed by
  // two newlines, with no list markup.
  test('extractRawText only retains raw text', function() {
      var source = {path: path.join(__dirname, "test-data/simple-list.docx")};
      return mammoth.extractRawText(source).then(function(extracted) {
          var expectedText = 'Apple\n\nBanana\n\n';
          assert.equal(extracted.value, expectedText);
      });
  });
  // Reads the fixture into a Buffer and extracts raw text via the
  // {buffer: ...} input form.
  test('extractRawText can use .docx files represented by a Buffer', function() {
      var fixturePath = path.join(__dirname, "test-data/single-paragraph.docx");

      function extract(contents) {
          return mammoth.extractRawText({buffer: contents});
      }

      function verify(extracted) {
          assert.equal(extracted.value, "Walking on imported air\n\n");
          assert.deepEqual(extracted.messages, []);
      }

      return promises.nfcall(fs.readFile, fixturePath)
          .then(extract)
          .then(verify);
  });
  // Converting a zip that is not a docx must reject. The two-argument then()
  // keeps the "Expected error" failure from being routed into the rejection
  // handler.
  test('should throw error if file is not a valid docx document', function() {
      var zipPath = path.join(__dirname, "test-data/empty.zip");

      function failOnSuccess() {
          assert.ok(false, "Expected error");
      }

      function checkError(error) {
          assert.equal(error.message, "Could not find main document part. Are you sure this is a valid .docx file?");
      }

      return mammoth.convertToHtml({path: zipPath}).then(failOnSuccess, checkError);
  });