src/class/WeChatArticle.js

import { fetchAndParse } from '../utils'
import { collectMedia } from '../lib/media'
import moment from 'moment'
import { VM } from 'vm2'

/** @external {$} https://cheerio.js.org/ */

/**
 * This class is for a typical WeChat article. Takes an URL to that article.
 * e.g. http://mp.weixin.qq.com/s/5NxzEg0N18v-AuOB_RmSDw
 *
 * In most cases you don't really need to use this class. Use {@link NormalizedWeChatArticle}
 * if you need normalization on articles.
 *
 * All content-based properties are only available after {@link WeChatArticle#fetchAndParse}.
 *
 * @example
 * import { WeChatArticle } from 'wearticle'
 * const article = new WeChatArticle('http://mp.weixin.qq.com/s/5NxzEg0N18v-AuOB_RmSDw')
 * await article.fetchAndParse()
 * console.log(article.toString())
 * // -> WeChatArticle ("关于 ¡Hola!" by "Holateens")
 *
 * @since 0.1.0
 */
export default class WeChatArticle {
  /**
   * Take the url. Just note the article won't be actually loaded and parsed
   * before you call {@link WeChatArticle#fetchAndParse} manually.
   *
   * You could see if this article is parsed or not, by using {@link WeChatArticle#isParsed}.
   *
   * @param {String} url - A valid WeChat article URL, begins with
   * `http(s)://mp.weixin.qq.com/s`
   */
  constructor (url) {
    if (!url.match(/https*:\/\/mp\.weixin\.qq\.com\/s[/?]/ig)) {
      throw new Error(url + ' is not a WeChat article url')
    }

    /** @type {String} */
    this.url = url

    this._parsed = false
  }

  /**
   * See if this article has been loaded & parsed.
   *
   * Load & parse with {@link WeChatArticle#fetchAndParse}.
   *
   * @return {Boolean} Parsed or not.
   */
  isParsed () { return this._parsed }

  /**
   * Fetch & parse the article.
   *
   * @return {WeChatArticle} Returns `this` so you can chain calls.
   */
  async fetchAndParse () {
    const $ = await fetchAndParse(this.url)

    /** @type {$} */
    this._$ = $

    let usefulCode = 'const window = {}; const __getInfoFunc = () => {'
    usefulCode += $('#activity-detail > script:nth-child(7)').get()[0].children[0].data
    usefulCode += `
        return {
          copyrightStat: copyright_stat,
          account: {
            id: user_name,
            name: nickname,
            avatarUrl: round_head_img
          },
          title: msg_title,
          description: msg_desc,
          sourceUrl: msg_source_url,
          headPicUrl: msg_cdn_url
        };
      };
      __getInfoFunc();
    `
    const infoObj = (new VM()).run(usefulCode)

    /** @type {String} */
    this.title = infoObj.title

    /** @type {Date} */
    this.publishedAt = moment($('#post-date').text().trim()).toDate()
    const authorEl = $('#meta_content > em:nth-child(2)')

    /**
     * The author. Only exists when it does have an author field.
     *
     * To get a must-have author name, use {@link WeChatArticle#getAuthorName}.
     * @type {String}
     */
    this.author = (authorEl.text() !== $('#post-date').text())
      ? authorEl.text().trim()
      : undefined
    this.account = infoObj.account

    /**
     * The original content fetched from WeChat public platform. Actually from
     * `#js_content` part of the whole raw HTML.
     * @type {String}
     */
    this.content = $('#js_content').html()

    this.sourceUrl = infoObj.sourceUrl
    this.headPicUrl = infoObj.headPicUrl
    this.description = infoObj.description
    this._parsed = true
    return this
  }

  /**
   * Returns a cheerio $ of the content.
   *
   * To tell the truth, this $ is not a fully functional $. It's actually a
   * function delegates cheerio instance in some way. You can only use `$(selector)`
   * there, and even can't use a context.
   *
   * @return {$} The cheerio $.
   * @since 0.2.0
   */
  getContent$ () { return selector => this._$(selector, '#js_content') }

  /**
   * Update content with your modified `$`. Don't forget to do this.
   *
   * @since 0.4.0
   */
  updateContent () { this.content = this._$('#js_content').html() }

  /**
   * Get an array of media in the article. Useful when you need to deal with it.
   * An empty array will be returned in case there is no media.
   *
   * Currently we only parse images, so it will be an array of {@link ArticleImage}.
   * See its documentation for reference.
   *
   * Parse media only when you try to get them.
   *
   * @return {Array}
   *
   * @since 0.4.0
   */
  getMedia () {
    if (!this._media) this._media = collectMedia(this._$)
    return this._media
  }

  /**
   * Returns author's name if the author does specify an author name on the
   * article, or the name of the Official Account if not.
   *
   * @return {String} The author's name.
   */
  getAuthorName () { return this.author || this.account.name }
  toString () {
    return this._parsed
      ? `WeChatArticle ("${this.title}" by "${this.getAuthorName()}")`
      : `WeChatArticle (unparsed ${this.url})`
  }
}