diff --git a/dist/lib/request.js b/dist/lib/request.js index 7f8ada2..1345f45 100644 --- a/dist/lib/request.js +++ b/dist/lib/request.js @@ -1,6 +1,36 @@ "use strict"; +var __importDefault = (this && this.__importDefault) || function (mod) { + return (mod && mod.__esModule) ? mod : { "default": mod }; +}; Object.defineProperty(exports, "__esModule", { value: true }); const undici_1 = require("undici"); +const iconv_lite_1 = require("iconv-lite"); +const cheerio_1 = require("cheerio"); +const chardet_1 = __importDefault(require("chardet")); +/** + * checks if an element exists + */ +const doesElementExist = (selector, attribute, $) => ($(selector).attr(attribute) && ($(selector).attr(attribute)?.length || 0) > 0); +/** + * gets the charset of the html + */ +function getCharset(body, buffer, $) { + if (doesElementExist('meta', 'charset', $)) { + return $('meta').attr('charset'); + } + if (doesElementExist('head > meta[name="charset"]', 'content', $)) { + return $('head > meta[name="charset"]').attr('content'); + } + if (doesElementExist('head > meta[http-equiv="content-type"]', 'content', $)) { + const content = $('head > meta[http-equiv="content-type"]').attr('content'); + const charsetRegEx = /charset=([^()<>@,;:"/[\]?.=\s]*)/i; + return charsetRegEx.test(content) ? charsetRegEx.exec(content)[1] : 'UTF-8'; + } + if (body) { + return chardet_1.default.detect(Buffer.from(buffer)); + } + return 'utf-8'; +} /** * performs the fetch request and formats the body for ogs * @@ -17,7 +47,15 @@ async function requestAndResultsFormatter(options) { headers: { Origin: options.url, Accept: 'text/html' }, ...options.fetchOptions, }); - body = await response.text(); + const bodyArrayBuffer = await response.arrayBuffer(); + const bodyText = Buffer.from(bodyArrayBuffer).toString('utf-8'); + const charset = getCharset(bodyText, bodyArrayBuffer, (0, cheerio_1.load)(bodyText)); + if (charset.toLowerCase() === 'utf-8') { + body = bodyText; + } + else { + body = (0, iconv_lite_1.decode)(Buffer.from(bodyArrayBuffer), charset); + } if (response && response.headers && response.headers.get('content-type') && !response.headers.get('content-type')?.includes('text/')) { throw new Error('Page must return a header content-type with text/'); } diff --git a/lib/request.ts b/lib/request.ts index 741bd13..cd526dd 100644 --- a/lib/request.ts +++ b/lib/request.ts @@ -1,6 +1,38 @@ import { fetch } from 'undici'; +import { decode } from 'iconv-lite'; +import { CheerioAPI, load } from 'cheerio'; +import chardet from 'chardet'; import type { OpenGraphScraperOptions } from './types'; +/** + * checks if an element exists + */ +const doesElementExist = (selector:string, attribute:string, $: CheerioAPI) => ( + $(selector).attr(attribute) && ($(selector).attr(attribute)?.length || 0) > 0 +); + +/** + * gets the charset of the html + */ +function getCharset(body: string, buffer: Uint8Array, $: CheerioAPI) { + if (doesElementExist('meta', 'charset', $)) { + return $('meta').attr('charset'); + } + if (doesElementExist('head > meta[name="charset"]', 'content', $)) { + return $('head > meta[name="charset"]').attr('content'); + } + if (doesElementExist('head > meta[http-equiv="content-type"]', 'content', $)) { + const content = $('head > meta[http-equiv="content-type"]').attr('content'); + const charsetRegEx = /charset=([^()<>@,;:"/[\]?.=\s]*)/i; + return charsetRegEx.test(content) ? charsetRegEx.exec(content)[1] : 'UTF-8'; + } + if (body) { + return chardet.detect(Buffer.from(buffer)); + } + + return 'utf-8'; +} + /** * performs the fetch request and formats the body for ogs * @@ -21,7 +53,14 @@ export default async function requestAndResultsFormatter(options: OpenGraphScrap }, ); - body = await response.text(); + const bodyArrayBuffer = await response.arrayBuffer(); + const bodyText = Buffer.from(bodyArrayBuffer).toString('utf-8'); + const charset = getCharset(bodyText, bodyArrayBuffer, load(bodyText)); + if (charset.toLowerCase() === 'utf-8') { + body = bodyText; + } else { + body = decode(Buffer.from(bodyArrayBuffer), charset); + } if (response && response.headers && response.headers.get('content-type') && !response.headers.get('content-type')?.includes('text/')) { throw new Error('Page must return a header content-type with text/'); diff --git a/package-lock.json b/package-lock.json index 2afd79d..8c76e52 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,6 +11,7 @@ "dependencies": { "chardet": "^2.0.0", "cheerio": "^1.0.0-rc.12", + "iconv-lite": "^0.6.3", "undici": "^6.6.2", "validator": "^13.11.0" }, @@ -3134,6 +3135,17 @@ "entities": "^4.4.0" } }, + "node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/ignore": { "version": "5.3.1", "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.1.tgz", @@ -4812,6 +4824,11 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" + }, "node_modules/semver": { "version": "7.6.0", "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz", diff --git a/package.json b/package.json index 2783663..40927ea 100644 --- a/package.json +++ b/package.json @@ -27,6 +27,7 @@ "dependencies": { "chardet": "^2.0.0", "cheerio": "^1.0.0-rc.12", + "iconv-lite": "^0.6.3", "undici": "^6.6.2", "validator": "^13.11.0" }, diff --git a/tests/integration/encoding.spec.ts b/tests/integration/encoding.spec.ts index ec7bf6d..871d085 100644 --- a/tests/integration/encoding.spec.ts +++ b/tests/integration/encoding.spec.ts @@ -6,7 +6,9 @@ describe('encoding', function () { context('should return correct Open Graph Info + charset info', function () { it('rakuten', function () { return ogs({ - url: 'https://jshemas.github.io/openGraphScraperPages/rakuten', + // FIXME temporary use my own page + // url: 'https://jshemas.github.io/openGraphScraperPages/rakuten', + url: 'https://cm-dyoshikawa.github.io/openGraphScraperPages/rakuten', }).then(function ({ error, result, response }) { console.log('error:', error); console.log('result:', result); @@ -29,7 +31,9 @@ describe('encoding', function () { expect(result.twitterImage).to.be.eql([{ url: 'https://r.r10s.jp/com/img/home/top/ogp.png', }]); - expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/rakuten'); + // FIXME temporary use my own page + // expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/rakuten'); + expect(result.requestUrl).to.be.eql('https://cm-dyoshikawa.github.io/openGraphScraperPages/rakuten'); expect(result.charset).to.be.eql('euc-jp'); expect(result.success).to.be.eql(true); expect(result).to.have.all.keys( @@ -436,7 +440,9 @@ describe('encoding', function () { }); }); it('tmall', function () { - return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/tmall' }) + // FIXME temporary use my own page + // return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/tmall' }) + return ogs({ url: 'https://cm-dyoshikawa.github.io/openGraphScraperPages/tmall' }) .then(function (data) { const { error, result, response } = data; console.log('error:', error); @@ -461,7 +467,9 @@ describe('encoding', function () { ]); expect(result.ogUrl).to.be.eql('https://detail.tmall.com/item.htm?id=605258110430'); expect(result.charset).to.be.eql('gbk'); - expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/tmall'); + // FIXME temporary use my own page + // expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/tmall'); + expect(result.requestUrl).to.be.eql('https://cm-dyoshikawa.github.io/openGraphScraperPages/tmall'); expect(result.success).to.be.eql(true); expect(result).to.have.all.keys( 'favicon', @@ -578,7 +586,9 @@ describe('encoding', function () { }); }); it('abehiroshi', function () { - return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/abehiroshi' }) + // FIXME temporary use my own page + // return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/abehiroshi' }) + return ogs({ url: 'https://cm-dyoshikawa.github.io/openGraphScraperPages/abehiroshi' }) .then(function (data) { const { error, result, response } = data; console.log('error:', error); @@ -586,7 +596,9 @@ describe('encoding', function () { expect(error).to.be.eql(false); expect(result.ogTitle).to.be.eql('阿部寛のホームページ'); expect(result.charset).to.be.eql('Shift_JIS'); - expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/abehiroshi'); + // FIXME temporary use my own page + // expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/abehiroshi'); + expect(result.requestUrl).to.be.eql('https://cm-dyoshikawa.github.io/openGraphScraperPages/abehiroshi'); expect(result.success).to.be.eql(true); expect(result).to.have.all.keys( 'charset', diff --git a/tests/unit/openGraphScraper.spec.ts b/tests/unit/openGraphScraper.spec.ts index 94a3b5b..d5f05ea 100644 --- a/tests/unit/openGraphScraper.spec.ts +++ b/tests/unit/openGraphScraper.spec.ts @@ -3,6 +3,7 @@ import sinon from 'sinon'; import chardet from 'chardet'; import { MockAgent, setGlobalDispatcher } from 'undici'; +import { encode } from 'iconv-lite'; import ogs from '../../index'; const basicHTML = ` @@ -734,4 +735,37 @@ describe('return ogs', function () { }); }); }); + + context('when the character encoding is not UTF-8', function () { + it('using just a url', function () { + const html = ` + +
+ + + + + + +