Merge pull request #20381 from Yoast/158-tokenize-sentences-in-japanese
158 tokenize sentences in japanese
mykola committed Jun 13, 2023
2 parents ef27594 + 61c4e98 commit 200f811
Showing 8 changed files with 253 additions and 4 deletions.
57 changes: 57 additions & 0 deletions packages/yoastseo/spec/languageProcessing/languages/ja/helpers/splitIntoTokensCustomSpec.js
@@ -0,0 +1,57 @@
import splitIntoTokensCustom from "../../../../../src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom";

const testcases = [
	{
		description: "should return an empty result if the sentence is empty",
		sentence: { text: "", sourceCodeRange: { startOffset: 0, endOffset: 0 } },
		expected: [],
	},
	{
		description: "should correctly tokenize a simple Japanese sentence without punctuation",
		sentence: {
			text: "犬が大好き",
			sourceCodeRange: { startOffset: 0, endOffset: 5 },
		},
		expected: [ "犬", "が", "大好き" ],
	},
	{
		description: "should correctly tokenize a Japanese sentence with Japanese punctuation",
		sentence: {
			text: "犬が大好き\u3002",
			sourceCodeRange: { startOffset: 0, endOffset: 6 },
		},
		expected: [ "犬", "が", "大好き", "。" ],
	},
	{
		description: "should correctly tokenize a Japanese sentence with English punctuation",
		sentence: {
			text: "犬が大好き.",
			sourceCodeRange: { startOffset: 0, endOffset: 6 },
		},
		expected: [ "犬", "が", "大好き", "." ],
	},
	{
		description: "should correctly tokenize a Japanese sentence with quotation marks inside",
		sentence: {
			text: "犬「が」大好き\u3002",
			sourceCodeRange: { startOffset: 0, endOffset: 8 },
		},
		expected: [ "犬", "「", "が", "」", "大好き", "。" ],
	},
	{
		description: "should correctly tokenize a Japanese sentence with quotation marks around it",
		sentence: {
			text: "『犬が大好き\u3002』",
			sourceCodeRange: { startOffset: 0, endOffset: 8 },
		},
		expected: [ "『", "犬", "が", "大好き", "。", "』" ],
	},
];

describe.each( testcases )( "splitIntoTokensCustom for Japanese: %p", ( { description, sentence, expected } ) => {
	it( description, () => {
		const tokens = splitIntoTokensCustom( sentence );
		expect( tokens ).toEqual( expected );
	} );
} );
61 changes: 61 additions & 0 deletions packages/yoastseo/spec/parse/build/buildSpec.js
@@ -2,6 +2,7 @@ import build from "../../../src/parse/build/build";
import LanguageProcessor from "../../../src/parse/language/LanguageProcessor";
import Factory from "../../specHelpers/factory";
import memoizedSentenceTokenizer from "../../../src/languageProcessing/helpers/sentence/memoizedSentenceTokenizer";
import splitIntoTokensCustom from "../../../src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom";

describe( "The parse function", () => {
it( "parses a basic HTML text", () => {
@@ -67,6 +68,66 @@ describe( "The parse function", () => {
		} );
	} );

it( "parses a basic Japanese HTML text", () => {
const html = "<div><p class='yoast'>犬が大好き</p></div>";

const researcher = Factory.buildMockResearcher( {}, true, false, false,
{ splitIntoTokensCustom: splitIntoTokensCustom, memoizedTokenizer: memoizedSentenceTokenizer } );
const languageProcessor = new LanguageProcessor( researcher );
expect( build( html, languageProcessor ) ).toEqual( {
name: "#document-fragment",
attributes: {},
childNodes: [ {
name: "div",
sourceCodeLocation: {
startOffset: 0,
endOffset: 37,
startTag: {
startOffset: 0,
endOffset: 5,
},
endTag: {
startOffset: 31,
endOffset: 37,
},
},
attributes: {},
childNodes: [ {
name: "p",
isImplicit: false,
attributes: {
"class": new Set( [ "yoast" ] ),
},
sentences: [ {
text: "犬が大好き",
sourceCodeRange: { startOffset: 22, endOffset: 27 },
tokens: [
{ text: "犬", sourceCodeRange: { startOffset: 22, endOffset: 23 } },
{ text: "が", sourceCodeRange: { startOffset: 23, endOffset: 24 } },
{ text: "大好き", sourceCodeRange: { startOffset: 24, endOffset: 27 } },
],
} ],
childNodes: [ {
name: "#text",
value: "犬が大好き",
} ],
sourceCodeLocation: {
startOffset: 5,
endOffset: 31,
startTag: {
startOffset: 5,
endOffset: 22,
},
endTag: {
startOffset: 27,
endOffset: 31,
},
},
} ],
} ],
} );
} );

it( "adds implicit paragraphs around phrasing content outside of paragraphs and headings", () => {
const html = "<div>Hello <span>World!</span></div>";

79 changes: 79 additions & 0 deletions packages/yoastseo/spec/parse/build/private/tokenizeSpec.js
@@ -1,6 +1,7 @@
import tokenize from "../../../../src/parse/build/private/tokenize";
import Paper from "../../../../src/values/Paper";
import EnglishResearcher from "../../../../src/languageProcessing/languages/en/Researcher";
import JapaneseResearcher from "../../../../src/languageProcessing/languages/ja/Researcher";
import { buildTreeNoTokenize } from "../../../specHelpers/parse/buildTree";
import LanguageProcessor from "../../../../src/parse/language/LanguageProcessor";

@@ -192,3 +193,81 @@ describe( "A test for the tokenize function", function() {
		} );
	} );
} );

describe( "A test for tokenizing a japanese sentence", function() {
it( "should correctly tokenize a simple Japanse sentence.", function() {
const mockPaper = new Paper( "<p>犬が大好き\u3002</p>", { locale: "ja_JP" } );
const mockResearcher = new JapaneseResearcher( mockPaper );
const languageProcessor = new LanguageProcessor( mockResearcher );
buildTreeNoTokenize( mockPaper );
// eslint-disable-next-line max-len
expect( tokenize( mockPaper.getTree(), languageProcessor ) ).toEqual( {
attributes: {},
childNodes: [
{
attributes: {},
childNodes: [
{
name: "#text",
value: "犬が大好き。",
},
],
isImplicit: false,
name: "p",
sentences: [
{
sourceCodeRange: {
startOffset: 3,
endOffset: 9,
},
text: "犬が大好き。",
tokens: [
{
sourceCodeRange: {
startOffset: 3,
endOffset: 4,
},
text: "犬",
},
{
sourceCodeRange: {
startOffset: 4,
endOffset: 5,
},
text: "が",
},
{
sourceCodeRange: {
startOffset: 5,
endOffset: 8,
},
text: "大好き",
},
{
sourceCodeRange: {
startOffset: 8,
endOffset: 9,
},
text: "。",
},
],
},
],
sourceCodeLocation: {
startOffset: 0,
endOffset: 13,
startTag: {
startOffset: 0,
endOffset: 3,
},
endTag: {
startOffset: 9,
endOffset: 13,
},
},
},
],
name: "#document-fragment",
} );
} );
} );
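A quick sanity check on the offsets in this expectation (my addition, not part of the PR; it is plain arithmetic over the raw HTML string): the sentence starts at offset 3 because the opening "<p>" tag occupies offsets 0 through 2.

// Offsets index into the raw HTML "<p>犬が大好き。</p>".
const html = "<p>犬が大好き\u3002</p>";
console.log( html.indexOf( "犬" ) ); // 3 -> sentence startOffset
console.log( 3 + "犬が大好き\u3002".length ); // 9 -> sentence endOffset, where "</p>" begins
console.log( html.length ); // 13 -> endTag endOffset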
21 changes: 21 additions & 0 deletions packages/yoastseo/spec/parse/language/LanguageProcessorSpec.js
@@ -2,6 +2,7 @@ import LanguageProcessor from "../../../src/parse/language/LanguageProcessor";
import Factory from "../../specHelpers/factory";
import memoizedSentenceTokenizer from "../../../src/languageProcessing/helpers/sentence/memoizedSentenceTokenizer";
import Sentence from "../../../src/parse/structure/Sentence";
import splitIntoTokensCustom from "../../../src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom";

const researcher = Factory.buildMockResearcher( {}, true, false, false,
{ memoizedTokenizer: memoizedSentenceTokenizer } );
@@ -432,3 +433,23 @@ describe.each( splitIntoTokensTestCases )( "A test for the tokenize method", ( {
		expect( tokens ).toEqual( expectedTokens );
	} );
} );

describe( "A test for the splitIntoTokens method in Japanese", () => {
it( "should return an array of tokens", function() {
const japaneseResearcher = Factory.buildMockResearcher( {}, true, false, false,
{ splitIntoTokensCustom: splitIntoTokensCustom } );
const languageProcessor = new LanguageProcessor( japaneseResearcher );
const tokens = languageProcessor.splitIntoTokens( new Sentence( "ウクライナは、東ヨーロッパに位置する国家。" ) );
expect( tokens ).toEqual( [
{ text: "ウクライナ", sourceCodeRange: {} },
{ text: "は", sourceCodeRange: {} },
{ text: "、", sourceCodeRange: {} },
{ text: "東ヨーロッパ", sourceCodeRange: {} },
{ text: "に", sourceCodeRange: {} },
{ text: "位置", sourceCodeRange: {} },
{ text: "する", sourceCodeRange: {} },
{ text: "国家", sourceCodeRange: {} },
{ text: "。", sourceCodeRange: {} },
] );
} );
} );
2 changes: 2 additions & 0 deletions packages/yoastseo/src/languageProcessing/languages/ja/Researcher.js
@@ -10,6 +10,7 @@ import customCountLength from "./helpers/countCharacters";
import matchTransitionWordsHelper from "./helpers/matchTransitionWords";
import getContentWords from "./helpers/getContentWords";
import memoizedTokenizer from "./helpers/memoizedSentenceTokenizer";
import splitIntoTokensCustom from "./helpers/splitIntoTokensCustom";

// All config
import firstWordExceptions from "./config/firstWordExceptions";
@@ -72,6 +73,7 @@ export default class Researcher extends AbstractResearcher {
			customCountLength,
			matchTransitionWordsHelper,
			memoizedTokenizer,
			splitIntoTokensCustom,
		} );

		Object.assign( this.defaultResearches, {
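For orientation, a minimal sketch of what registering splitIntoTokensCustom under this.helpers buys (my illustration, not part of the diff; the relative import paths are assumptions):

import Researcher from "./Researcher";
import Paper from "../../../values/Paper";

const researcher = new Researcher( new Paper( "犬が大好き\u3002", { locale: "ja_JP" } ) );
// getHelper() is the lookup that LanguageProcessor.splitIntoTokens performs (see below);
// researchers that do not register this helper fall back to regex-based token splitting.
const splitIntoTokens = researcher.getHelper( "splitIntoTokensCustom" );
splitIntoTokens( { text: "犬が大好き\u3002" } ); // -> [ "犬", "が", "大好き", "。" ]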
22 changes: 22 additions & 0 deletions packages/yoastseo/src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom.js
@@ -0,0 +1,22 @@
import { map } from "lodash-es";
import TinySegmenter from "tiny-segmenter";

/**
 * Splits a sentence into tokens, using TinySegmenter for Japanese.
 *
 * @param {Sentence} sentence The sentence to split.
 *
 * @returns {string[]} The tokens as strings.
 */
function splitIntoTokensCustom( sentence ) {
	// Retrieve the sentence text from the Sentence object.
	const sentenceText = sentence.text;
	// Return an empty array if the sentence is empty.
	if ( sentenceText === "" ) {
		return [];
	}
	// Split the sentence into words with TinySegmenter; each word becomes a token.
	const words = new TinySegmenter().segment( sentenceText );
	return map( words );
}
export default splitIntoTokensCustom;
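A minimal usage sketch of this helper (mine, not from the PR; the import path is assumed). The expected output follows the spec file at the top of this commit:

import splitIntoTokensCustom from "./splitIntoTokensCustom";

// Only the `text` property is read, so a plain object stands in for a Sentence here.
const sentence = { text: "犬が大好き\u3002", sourceCodeRange: { startOffset: 0, endOffset: 6 } };
console.log( splitIntoTokensCustom( sentence ) ); // -> [ "犬", "が", "大好き", "。" ]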
8 changes: 4 additions & 4 deletions packages/yoastseo/src/parse/build/private/tokenize.js
@@ -6,12 +6,12 @@ import getTextElementPositions from "./getTextElementPositions";
  *
  * @param {Paragraph|Heading} node The paragraph or heading node to split into sentences.
  * @param {Sentence} sentence The sentence.
- * @param {function} splitIntoTokens The function to use to split the sentence into tokens.
+ * @param {LanguageProcessor} languageProcessor The language processor for the current language.
  *
  * @returns {Sentence} The sentence, with tokens.
  */
-function getTokens( node, sentence, splitIntoTokens ) {
-	sentence.tokens = splitIntoTokens( sentence );
+function getTokens( node, sentence, languageProcessor ) {
+	sentence.tokens = languageProcessor.splitIntoTokens( sentence );
 	sentence.tokens = getTextElementPositions( node, sentence.tokens, sentence.sourceCodeRange.startOffset );
 	return sentence;
 }
@@ -31,7 +31,7 @@ function getSentences( node, languageProcessor ) {
 	// Add position information to the sentences.
 	sentences = getTextElementPositions( node, sentences );
 	// Tokenize sentences into tokens.
-	return sentences.map( sentence => getTokens( node, sentence, languageProcessor.splitIntoTokens ) );
+	return sentences.map( sentence => getTokens( node, sentence, languageProcessor ) );
 }

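A note on why getTokens now receives the whole languageProcessor rather than languageProcessor.splitIntoTokens (my reading; the PR does not state it): passing the method as a bare function detaches it from its receiver, so the this.researcher.getHelper() lookup inside splitIntoTokens (see LanguageProcessor.js below) would throw. A self-contained sketch of the pitfall, with stand-in names:

class Processor {
	constructor() {
		this.helper = ( text ) => [ text ];
	}
	splitIntoTokens( sentence ) {
		// Relies on `this` being the Processor instance,
		// just as the real method relies on `this.researcher`.
		return this.helper( sentence.text );
	}
}

const processor = new Processor();
const detached = processor.splitIntoTokens;
// detached( { text: "犬" } ); // TypeError: cannot read `helper` of undefined
processor.splitIntoTokens( { text: "犬" } ); // works, as in the refactored getTokens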
7 changes: 7 additions & 0 deletions packages/yoastseo/src/parse/language/LanguageProcessor.js
@@ -72,6 +72,13 @@ class LanguageProcessor {
		// Retrieve the sentence text from the Sentence object.
		const sentenceText = sentence.text;

		// If a language-specific splitIntoTokensCustom helper exists, use its output as the tokens.
		const tokenTextsCustom = this.researcher.getHelper( "splitIntoTokensCustom" );
		if ( tokenTextsCustom ) {
			const tokensCustom = tokenTextsCustom( sentence );
			return tokensCustom.map( tokenText => new Token( tokenText ) );
		}

		// Split the sentence string into tokens. Those tokens are unrefined as they may contain punctuation.
		const rawTokens = sentenceText.split( wordSeparatorsRegex ).filter( x => x !== "" );

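This also explains why the LanguageProcessorSpec expectations above show sourceCodeRange: {} on every token: splitIntoTokens wraps each segmented string in a Token without position data, and offsets are only filled in later by getTextElementPositions during tokenize(). A stand-in sketch (the real Token constructor's shape is an assumption here):

// Stand-in for yoastseo's Token class (assumed: defaults to an empty source code range).
class Token {
	constructor( text, sourceCodeRange = {} ) {
		this.text = text;
		this.sourceCodeRange = sourceCodeRange;
	}
}

const tokens = [ "国家", "。" ].map( ( text ) => new Token( text ) );
console.log( tokens[ 0 ] ); // Token { text: "国家", sourceCodeRange: {} }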
