根据路径url和cookie截取指定页面html并将整个页面转成markdown格式输出。

背景

最近接到一个需求:系统迁移时,需要通过代码把公司wiki页面转成markdown,再由另一个系统把对应的markdown转成html并显示出来。
for me: 我负责把wiki转成markdown,供其他同事使用

解决方案

01、传入 {url, cookie}
02、获取 dom, 截取某个id内的html
03、将截取到的html转成xml
04、递归xml节点信息,边遍历边将指定节点信息转成markdown格式,遍历后置节点hasRead为true
05、输出markdownStr

难点

01、如何根据 url和cookie去获取dom
02、wiki获取的dom有混淆,遍历时需要排除已遍历的节点信息
03、行内元素和块级元素的处理
04、对wiki内存在表格和code的元素,截取的html都是table,如何根据信息将表格和code区分开
05、某些dom元素渲染如下<p><strong>确认</strong><p>信息</p><a>链接</a>如下</p> ,这种dom树如何正确转成markdown

对封装的代码解析如下:

  • 入口

1
2
3
4
5
6
"use strict";
const { ConfluenceToMarkdown } = require("lib/index.js");

module.exports = function(options) {
return new ConfluenceToMarkdown(options);
};
  • lib下index入口
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
'use strict';
const _ = require('lodash');
const assert = require('assert');
const { confluence2md } = require('./lib/confluence2md');

/**
 * Converts one Confluence wiki page to Markdown.
 *
 * The page is fetched with the caller's cookie, so the URL is validated
 * before any request is made.
 */
class ConfluenceToMarkdown {
  /**
   * @param {{url: string, cookie: string}} [options] - Page URL and the
   *   current user's cookie. Both are required.
   * @throws {AssertionError} When `url` or `cookie` is missing.
   */
  constructor(options = {}) {
    assert.ok(
      options.url,
      '缺少confluence文件路径----myfe/confluencetomarkdown'
    );
    assert.ok(
      options.cookie,
      '缺少当前用户cookie----myfe/confluencetomarkdown'
    );

    this.url = options.url;
    this.cookie = options.cookie;
  }

  /**
   * Validates the stored parameters and runs the conversion.
   *
   * @returns {Promise<{success: boolean, code: number, message: string, data?: string}>}
   *   A 400 result object when the parameters are missing or malformed,
   *   otherwise whatever `confluence2md` resolves with.
   */
  async tomarkdown() {
    // Returning plain values from an async function settles the promise; the
    // previous `new Promise` executor never called resolve/reject and hung.
    if (!this.url || !this.cookie) {
      return {
        success: false,
        code: 400,
        message: '请求参数缺失'
      };
    }

    // Anchored, with escaped dots: the cookie must only ever be sent to the
    // wiki host, not to a URL that merely contains the wiki address.
    const wikiPagePattern = /^https:\/\/wiki\.maoyan\.com\/pages/;
    const isValid =
      typeof this.url === 'string' &&
      typeof this.cookie === 'string' &&
      wikiPagePattern.test(this.url);
    if (!isValid) {
      return {
        success: false,
        code: 400,
        message: '请求参数格式不正确'
      };
    }

    // confluence2md is declared as (url, cookie), so pass positional arguments.
    return confluence2md(this.url, this.cookie);
  }
}

// Keep exporting the class itself, and also expose it under its own name so
// that `const { ConfluenceToMarkdown } = require(...)` resolves to the class.
module.exports = ConfluenceToMarkdown;
module.exports.ConfluenceToMarkdown = ConfluenceToMarkdown;
  • confluence2md 文件,利用superagent包请求指定url,获取页面dom节点信息
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
const superagent = require('superagent');
const { html2Xml } = require('./util/html2xml');

/**
 * Fetches a page with the given cookie and converts its HTML to Markdown.
 *
 * @param {string} url - Page address to request.
 * @param {string} cookie - Value sent in the `Cookie` request header.
 * @returns {Promise<{success: true, code: 200, message: string, data: string}>}
 *   Result object whose `data` is the Markdown produced by `html2Xml`.
 * @throws Rejects with the plain object `{success: false, code: 500, message}`
 *   when the request fails (shape kept for existing callers). Errors raised
 *   by `html2Xml` propagate unchanged.
 */
const confluence2md = async (url, cookie) => {
  let res;
  try {
    // A superagent request is thenable, so it can be awaited directly; the
    // previous `.end()` callback used `await` without being async.
    res = await superagent.get(url).set('Cookie', cookie);
  } catch (err) {
    console.info('get page error url => ' + url);
    throw {
      success: false,
      code: 500,
      message: 'url转换出错'
    };
  }

  const markdownText = await html2Xml(res.text);
  return {
    success: true,
    code: 200,
    message: '成功转换markdown',
    data: markdownText
  };
};

// Keep exporting the function itself, and also expose it under its own name
// so that `const { confluence2md } = require(...)` resolves to the function.
module.exports = confluence2md;
module.exports.confluence2md = confluence2md;
  • html2xml, 截取指定id内dom信息,转成xml,递归遍历xml节点信息,转成markdown格式

    依赖和标签

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
const cheerio = require('cheerio');
const xmlDom = require('xmldom').DOMParser;
const xpath = require('xpath');

// Origin prefixed to link targets that start with "/".
const httpUrl = 'https://baidu.com'; // url

// Markdown text accumulated while the document tree is traversed.
let markdownStr = ``;

// Attribute names read from <img> elements to build the image address.
const IMGSRC = 'data-image-src';
const BASEURL = 'data-base-url';

// h1..h6 map to one to six "#" characters.
const headingTags = Array.from({ length: 6 }, (unused, level) => ({
  key: `h${level + 1}`,
  value: '#'.repeat(level + 1),
}));

// Tag name -> Markdown marker table, consulted by tag name.
// NOTE(review): 'br' is listed twice (once with '' and once with a newline),
// so a lookup by key matches two entries; confirm which one is intended.
const tagHtml = [
  ...headingTags,
  ...[
    ['strong', '**'],
    ['hr', '---'],
    ['p', ''],
    ['span', ''],
    ['pre', ''],
    ['br', ''],
    ['table', ''],
    ['tr', ''],
    ['a', '[]()'],
    ['ul', ''],
    ['ol', ''],
    ['img', '![]'],
    ['br', `\n`],
  ].map(([key, value]) => ({ key, value })),
];

// Tags treated as block-level elements.
const blockLevelTag = ['p'].concat(headingTags.map((tag) => tag.key));
// Tags that belong to table markup.
const tableNode = ['tr', 'td', 'th', 'table', 'tbody', 'thead'];
// Table cell tags.
const thdTag = ['td', 'th'];
// Trailing characters after which no extra line break is appended.
const lineFlagArr = ['#', '', '|'];
  • html2Xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Extracts the `#main-content` element from a page, parses it as XML and
 * converts it to Markdown by traversing every top-level node.
 *
 * @param {string} str - Full HTML of the fetched page.
 * @returns {Promise<string>} The generated Markdown; an empty string when the
 *   page has no `#main-content` content.
 */
const html2Xml = async (str) => {
  // The accumulator is shared module state: start every conversion from an
  // empty string so a previous page's output is not prepended to this one.
  markdownStr = ``;

  const $ = cheerio.load(str);
  const contentStr = $('#main-content').html(); // inner HTML of the target element
  if (!contentStr) return markdownStr;

  const doc = new xmlDom().parseFromString(contentStr); // HTML parsed as XML
  const topLevelNodes = (doc && doc.childNodes) || [];
  for (let nodeIndex = 0; nodeIndex < topLevelNodes.length; nodeIndex++) {
    deepTraversal(topLevelNodes[nodeIndex]);
  }

  console.info(markdownStr);
  // Returning from the async function resolves the promise; the previous
  // `new Promise` executor returned the string without ever resolving.
  return markdownStr;
};
  • deepTraversal 递归时需注意行内元素和块级元素
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/**
 * Depth-first walk over a node and its descendants, appending the Markdown
 * of every visited node to the shared `markdownStr` accumulator.
 *
 * @param {Node|null|undefined} node - Node to convert; nullish values are ignored.
 * @returns {void}
 */
const deepTraversal = (node) => {
  // Guard first: the node used to be dereferenced before the null check.
  if (node === null || node === undefined) return;

  // Skip the table-of-contents block and the toolbar <div> of code macros,
  // including everything below them. The class is read from the node itself
  // instead of serializing and re-parsing the whole subtree for each node.
  if (node.nodeName === 'div' && typeof node.getAttribute === 'function') {
    const className = node.getAttribute('class') || '';
    if (className.indexOf('toc') > -1 || className === 'toolbar') return;
  }

  const markdown = xml2Md(node);
  const isBlock = blockLevelTag.indexOf(node.nodeName) > -1;
  const parentIsBlock =
    Boolean(node.parentNode) &&
    blockLevelTag.indexOf(node.parentNode.nodeName) > -1;
  const written = markdownStr.trim();

  if (lineFlagArr.indexOf(markdown.trim().slice(-1)) > -1 || (!isBlock && parentIsBlock)) {
    // Empty output, text ending in '#' or '|', or inline content inside a
    // block element: append without a line break.
    markdownStr += `${markdown}`;
  } else if (written.slice(-1) === '*' && written.slice(-2) !== '**') {
    // Output so far ends with a single '*': separate with a space.
    markdownStr += ` ${markdown}\n`;
  } else {
    markdownStr += `${markdown}\n`;
  }

  const children = node.childNodes;
  if (children) {
    for (let i = 0; i < children.length; i++) {
      deepTraversal(children[i]);
    }
  }
};
  • xml2Md
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/**
 * Converts a single node to Markdown by dispatching on its tag name.
 *
 * Every `tagHtml` entry whose key equals the node name is applied in order.
 * Nodes whose name is not listed (for example text nodes) yield ''.
 *
 * @param {Node} node - Node to convert.
 * @returns {string} Markdown for the node; never `undefined`.
 */
const xml2Md = (node) => {
  const nodeName = node.nodeName;
  const isBlockLevel = blockLevelTag.indexOf(nodeName) > -1;

  // A converter that bails out may hand back a non-string; keep it from being
  // concatenated into the output as the literal text "undefined".
  const asText = (value) => (typeof value === 'string' ? value : '');

  const convert = (item) => {
    switch (item.key) {
      case 'img':
        return asText(img2Md(node, item));
      case 'a':
        return asText(a2Md(node, item));
      case 'table':
        return asText(table2Md(node)) + `\n`;
      case 'strong':
        return asText(strong2Md(node, item));
      case 'p':
        return asText(p2Md(node, item)) + `\n`;
      case 'ol':
      case 'ul':
        return asText(ul2Md(node, item));
      case 'pre':
        return asText(code2Md(node, item));
      default:
        return asText(default2Md(node, item));
    }
  };

  let mds = ``;
  for (const item of tagHtml) {
    if (item.key !== nodeName) continue;
    const md = convert(item);
    // Block-level elements start on a new line.
    if (isBlockLevel) mds += `\n`;
    mds += md;
  }
  // The former `mds.replace('\n', '')` discarded its result, so it had no
  // effect and is dropped; the output is unchanged.
  return mds;
};
  • default2Md
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/**
 * Fallback converter for tags without a dedicated handler (headings, span,
 * hr, br, tr, ...).
 *
 * @param {Node} node - Element to convert.
 * @param {{key: string, value: string}} item - Matching `tagHtml` entry;
 *   `value` is the Markdown marker placed in front of the content.
 * @returns {string} Markdown for the node, or '' when the node was already
 *   read, belongs to table markup, or has an already-read child.
 */
const default2Md = (node, item) => {
  // Nodes that were already read are not read again.
  if (node.hasRead) return '';

  // Skip nodes reached through a table: either the first `class` attribute
  // found in the re-parsed parent markup mentions "table", or the parent is
  // itself a table element.
  const parentDom = new xmlDom().parseFromString(node.parentNode.toString());
  const classAttr = xpath.select1("//@class", parentDom);
  const parentClass = classAttr && classAttr.nodeValue;
  if (parentClass && parentClass.indexOf('table') > -1) return '';
  if (tableNode.indexOf(node.parentNode.nodeName) > -1) return '';
  if (filterTable(node)) return '';

  let md = ``;
  if (blockLevelTag.indexOf(node.nodeName) > -1) {
    const blockMd = blockFlag2Md(node, item);
    md += typeof blockMd === 'string' ? blockMd : '';
  } else {
    let value = ``;
    for (let dIndex = 0; dIndex < node.childNodes.length; dIndex++) {
      const child = node.childNodes[dIndex];
      // Return '' rather than undefined so callers never concatenate the
      // text "undefined" into the output.
      if (child && child.hasRead) return '';
      if (child && child.nodeName === '#text') value += child.data;
      value += xml2Md(child);
    }
    // This branch only runs for non-block tags, so no line break is added.
    md = value === '' ? `${item.value} ` : `${item.value} ${value}`;
  }

  node.hasRead = true;
  return md;
};
  • blockFlag2Md
1
2
3
4
5
6
7
8
9
10
11
12
13
/**
 * Converts a block-level element: the tag's marker, a space, then the
 * Markdown of every child in document order.
 *
 * @param {Node} node - Block-level element.
 * @param {{value: string}} item - `tagHtml` entry supplying the marker.
 * @returns {string} Markdown for the block, or '' when the node or one of
 *   its children has already been read.
 */
const blockFlag2Md = (node, item) => {
  if (node.hasRead) return '';

  const children = node.childNodes || [];
  let md = `${item.value} `;
  for (let index = 0; index < children.length; index++) {
    const child = children[index];
    // Return '' rather than undefined so callers never concatenate the text
    // "undefined" into the output.
    if (child.hasRead) return '';
    if (child.nodeName === '#text') md += child.data;
    md += xml2Md(child);
    child.hasRead = true;
  }

  node.hasRead = true;
  return md;
};
  • img2Md
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/**
 * Converts an <img> element to a Markdown image. The address is the value of
 * the BASEURL attribute followed by the value of the IMGSRC attribute.
 *
 * @param {Node} node - The <img> element.
 * @param {{value: string}} item - `tagHtml` entry supplying the image marker.
 * @returns {string} Marker followed by the address in parentheses, or ''
 *   when the node was already read or is filtered as table content.
 */
const img2Md = (node, item) => {
  if (node.hasRead || filterTable(node)) return '';

  // Value of the last attribute with the given name, '' when it is absent.
  const readAttribute = (wanted) => {
    let found = '';
    for (let i = 0; i < node.attributes.length; i += 1) {
      const attribute = node.attributes[i];
      if (attribute.nodeName === wanted) found = attribute.nodeValue;
    }
    return found;
  };

  const address = readAttribute(BASEURL) + readAttribute(IMGSRC);
  node.hasRead = true;
  return `${item.value}(${address})`;
};
  • a2Md
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/**
 * Converts an <a> element to a Markdown link.
 *
 * @param {Node} node - The <a> element.
 * @param {{value: string}} item - Matching `tagHtml` entry (not used).
 * @returns {string} `[label](href)`, or '' when the node was already read or
 *   is filtered as table content.
 */
const a2Md = (node, item) => {
  if (node.hasRead) return '';
  if (filterTable(node)) return '';

  // Read the href straight from the element's attributes.
  let alink = '';
  const attributes = node.attributes || [];
  for (let aIndex = 0; aIndex < attributes.length; aIndex++) {
    const attribute = attributes[aIndex];
    if (attribute.nodeName === 'href') alink = attribute.nodeValue;
  }
  // Root-relative targets are resolved against the configured origin.
  if (alink.charAt(0) === '/') alink = `${httpUrl}${alink}`;

  // Use the first child's text when it has some; otherwise (no children, or
  // the first child is an element) fall back to the element's text content
  // instead of crashing or emitting "[undefined]".
  const firstChild = node.childNodes && node.childNodes[0];
  const label =
    firstChild && typeof firstChild.data === 'string'
      ? firstChild.data
      : node.textContent || '';

  node.hasRead = true;
  return `[${label}](${alink})`;
};
  • strong2Md
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/**
 * Converts a <strong> element to Markdown bold text.
 *
 * @param {Node} node - The <strong> element.
 * @param {{value: string}} item - Matching `tagHtml` entry (not used).
 * @returns {string} ` **text** `, followed by a line break when nothing
 *   follows the element; '' when the element has no content, was already
 *   read, is filtered as table content, or has an already-read child.
 */
const strong2Md = (node, item) => {
  if (node.hasRead) return '';
  if (filterTable(node)) return '';

  const OPENING = ` **`;
  const hasNextSibling = node.nextSibling;
  // True when the parent is not block-level and something follows the parent.
  const parentContinues =
    blockLevelTag.indexOf(node.parentNode.nodeName) <= -1 &&
    node.parentNode.nextSibling;

  const children = node.childNodes || [];
  let value = OPENING;
  for (let sIndex = 0; sIndex < children.length; sIndex++) {
    const child = children[sIndex];
    // Return '' rather than undefined so callers never concatenate the text
    // "undefined" into the output.
    if (child && child.hasRead) return '';
    if (child && child.nodeName === '#text') value += child.data.trim() || '';
    value += xml2Md(child).trim();
    child.hasRead = true;
  }

  node.hasRead = true;
  // Compare against the actual opening marker: the old check used '**'
  // without the leading space, so an empty element produced ' **** '.
  if (value === OPENING) return '';
  return parentContinues || hasNextSibling ? `${value}** ` : `${value}** \n`;
};
  • p2Md
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/**
 * Converts a <p> element by concatenating the Markdown of its children.
 *
 * @param {Node} node - The <p> element.
 * @param {{value: string}} item - Matching `tagHtml` entry (not used).
 * @returns {string} Markdown for the paragraph, or '' when the node was
 *   already read, is filtered as table content, or has an already-read child.
 */
const p2Md = (node, item) => {
  // Skip nodes that were already read.
  if (node.hasRead) return '';
  // Skip content that is filtered as table content.
  if (filterTable(node)) return '';

  const children = node.childNodes || [];
  let md = ``;
  for (let index = 0; index < children.length; index++) {
    const child = children[index];
    // Return '' rather than undefined so callers never concatenate the text
    // "undefined" into the output.
    if (child.hasRead) return '';
    if (child.nodeName === '#text') md += child.data;
    md += xml2Md(child);
    child.hasRead = true;
  }

  node.hasRead = true;
  return md;
};
  • ul2Md
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/**
 * Converts a <ul> or <ol> element to a Markdown list.
 *
 * @param {Node} node - The list element.
 * @param {{key: string}} item - Matching `tagHtml` entry; key 'ol' selects
 *   numbered items, anything else bullet items.
 * @returns {string} One line per list item, or '' when the node was already
 *   read, is filtered as table content, or has an already-read child.
 */
const ul2Md = (node, item) => {
  if (node.hasRead) return '';
  // Skip content that is filtered as table content.
  if (filterTable(node)) return '';

  const isOrdered = item.key === 'ol';
  // Forwarded to li2Md unchanged: '*' for bullet lists, [] for numbered lists.
  const key = isOrdered ? [] : `*`;
  // A list nested inside an <li> starts each item on a new, indented line.
  const isNested = Boolean(node.parentNode) && node.parentNode.nodeName === 'li';
  const prefix = isNested ? `\n ` : '';

  const children = node.childNodes || [];
  let ulMd = ``;
  let itemNumber = 0;
  for (let index = 0; index < children.length; index++) {
    const child = children[index];
    // Return '' rather than undefined so callers never concatenate the text
    // "undefined" into the output.
    if (child.hasRead) return '';

    // Text and comment nodes between the items are not list items: mark them
    // read and skip them so they neither emit an empty bullet nor consume a
    // number.
    if (child.nodeType !== 1) {
      child.hasRead = true;
      continue;
    }

    itemNumber += 1;
    const marker = isOrdered ? `${itemNumber}、` : `*`;
    const itemMd = li2Md(child, key);
    ulMd += `${prefix}${marker} ${typeof itemMd === 'string' ? itemMd : ''} \n`;
    child.hasRead = true;
  }

  node.hasRead = true;
  return ulMd;
};
  • li2Md
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/**
 * Converts the content of a list item to Markdown.
 *
 * @param {Node} node - The <li> element.
 * @param {*} key - List marker passed by the caller (not used).
 * @returns {string} Markdown for the item content, or '' when the node was
 *   already read, is filtered as table content, or has an already-read child.
 */
const li2Md = (node, key) => {
  if (node.hasRead) return '';
  // Skip content that is filtered as table content.
  if (filterTable(node)) return '';

  // Nodes without a child list are treated as having no children.
  const children = node.childNodes || [];
  let limd = ``;
  for (let index = 0; index < children.length; index++) {
    const child = children[index];
    // Return '' rather than undefined so callers never concatenate the text
    // "undefined" into the output.
    if (child.hasRead) return '';

    // Surrounding whitespace of the text collected so far is dropped before
    // each append. The old `limd.substr('*')` test was simply truthy for any
    // non-empty string, so this is the behaviour it already had.
    limd = limd.trim();
    limd += child.nodeName === '#text' ? `${child.data}` : xml2Md(child);

    child.hasRead = true;
  }

  node.hasRead = true;
  return limd;
};
  • table2Md
1
2
3
4
5
6
7
8
9
10
11
12
13
/**
 * Converts a <table> element by serializing it, normalizing where the table
 * closes, and handing the markup to `tbody2Md`.
 *
 * @param {Node} node - The <table> element.
 * @returns {string} Whatever `tbody2Md` produces for the markup, or '' when
 *   the node was already read.
 */
const table2Md = (node) => {
  if (node.hasRead) return '';

  const TBODY_CLOSE = '</tbody>';
  let nodestr = node.toString();

  // Table closing fix-up: close the table right after its first </tbody> and
  // drop whatever follows. Markup without a </tbody> is passed on unchanged;
  // the earlier string juggling reduced such a table to a lone "</table>".
  const tbodyEnd = nodestr.indexOf(TBODY_CLOSE);
  if (tbodyEnd > -1) {
    nodestr = nodestr.slice(0, tbodyEnd + TBODY_CLOSE.length) + '</table>';
  }

  const tbodymd = tbody2Md(nodestr);
  node.hasRead = true;
  return typeof tbodymd === 'string' ? tbodymd : ``;
};
  • tbody2Md
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/**
 * Converts serialized table markup.
 *
 * A table whose class contains "confluenceTable" is kept as raw HTML; any
 * other table is treated as a code block and converted row by row.
 *
 * @param {string} nodestr - Serialized <table> markup.
 * @returns {string} Raw table HTML padded with line breaks, or the
 *   concatenated output of `codeTrMd` for every <tr>.
 */
const tbody2Md = (nodestr) => {
  // Parse once; the markup used to be parsed a second time for the rows.
  const tableDom = new xmlDom().parseFromString(nodestr);
  const classAttr = xpath.select1("/table/@class", tableDom);
  const tableClass = classAttr && classAttr.value;

  // Data table: emit the markup as is.
  if (tableClass && tableClass.indexOf('confluenceTable') > -1) {
    return `\n ${nodestr} \n`;
  }

  // Code table: every row is converted separately. No read-flags are set
  // here because these nodes belong to a throwaway document.
  const rows = xpath.select('//tr', tableDom);
  let tbodymd = ``;
  for (let trIndex = 0; trIndex < rows.length; trIndex++) {
    tbodymd += codeTrMd(rows[trIndex]);
  }
  return tbodymd;
};
  • codeTrMd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// code to markdown
/**
 * Converts one row of a code table into a fenced Markdown code block.
 *
 * Every element matched by `//td/div/div` in the row is one code line; the
 * line's text is the concatenation of the text nodes under its <code> tags.
 *
 * @param {Node} node - The <tr> element.
 * @returns {string} Fenced code block, or '' when the node was already read.
 */
const codeTrMd = (node) => {
  if (node.hasRead) return '';

  const FENCE = '```';
  const rowDom = new xmlDom().parseFromString(node.toString());
  const lineNodes = xpath.select("//td/div/div", rowDom);

  let body = '';
  for (let lineNo = 0; lineNo < lineNodes.length; lineNo += 1) {
    const lineDom = new xmlDom().parseFromString(lineNodes[lineNo].toString());
    const fragments = xpath.select("//code/text()", lineDom);
    let lineText = '';
    for (let part = 0; part < fragments.length; part += 1) {
      lineText += fragments[part].data;
    }
    body += lineText + `\n`;
  }

  return '\n' + FENCE + `\n` + body + FENCE + '\n';
};
  • filterTable
1
2
3
4
5
6
7
8
/**
 * Tells whether a node sits inside a Confluence data-table cell, i.e. a
 * <td>/<th> whose class contains "confluenceT".
 *
 * @param {Node} node - Node to test.
 * @returns {boolean} true when the cell found by `findtd` has such a class.
 */
const filterTable = (node) => {
  const cell = findtd(node);
  if (!cell || typeof cell.getAttribute !== 'function') return false;

  // Inspect the enclosing cell itself. The previous absolute `//td/@class`
  // query looked at the first <td> of the whole document, never matched a
  // <th>-only cell, and could throw when no <th> class existed.
  const cellClass = cell.getAttribute('class') || '';
  return cellClass.indexOf('confluenceT') > -1;
};
  • findtd
1
2
3
4
5
6
7
8
9
10
11
12
/**
 * Finds the nearest table cell (<td>/<th>) at or above a node.
 *
 * @param {Node|null|undefined} node - Starting node.
 * @param {number} [maxDepth=4] - How many ancestors above `node` are
 *   inspected; the default matches the four levels checked before.
 * @returns {Node|false} The cell element, or false when none is found within
 *   the inspected levels (or when `node` is nullish).
 */
const findtd = (node, maxDepth = 4) => {
  let current = node;
  // The loop condition doubles as the null guard: the node used to be
  // dereferenced before it was checked.
  for (let depth = 0; current && depth <= maxDepth; depth += 1) {
    if (thdTag.indexOf(current.nodeName) > -1) return current;
    current = current.parentNode;
  }
  return false;
};
  • code2Md
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
// pre tag code to markdown
/**
 * Converts a <pre> element to Markdown.
 *
 * A <pre> whose class contains "highlighter" becomes a fenced code block
 * tagged with the language taken from its `data-syntaxhighlighter-params`
 * attribute; any other <pre> yields its first text node as is.
 *
 * @param {Node} node - The <pre> element.
 * @returns {string} Fenced code block or plain text; '' when the node was
 *   already read or is filtered as table content.
 */
const code2Md = (node) => {
  if (node.hasRead) return '';
  // Skip content that is filtered as table content.
  if (filterTable(node)) return '';

  const FENCE = '```';
  const codeDom = new xmlDom().parseFromString(node.toString());
  const textNode = xpath.select1("/pre/text()", codeDom);
  const classAttr = xpath.select1("/pre/@class", codeDom);
  const code = (textNode && textNode.nodeValue) || '';
  const className = (classAttr && classAttr.nodeValue) || '';

  // Not a syntax-highlighter block: return the text unfenced.
  if (className.indexOf('highlighter') <= -1) return code;

  // The language is the value of the first "name: value" pair of the params.
  const paramsAttr = xpath.select1("/pre/@data-syntaxhighlighter-params", codeDom);
  const params = (paramsAttr && paramsAttr.nodeValue) || '';
  const language = (params.split(';')[0].split(':')[1] || '').trim();

  node.hasRead = true;
  // No padding around the code: the old template put a space in front of the
  // first code line, which alters indentation-sensitive code.
  return FENCE + language + '\n' + code + '\n' + FENCE;
};

github地址