Merge pull request #20381 from Yoast/158-tokenize-sentences-in-japanese
158 tokenize sentences in japanese
mykola committed Jun 13, 2023
2 parents ef27594 + 61c4e98 commit 200f811
Showing 8 changed files with 253 additions and 4 deletions.
57 changes: 57 additions & 0 deletions packages/yoastseo/spec/languageProcessing/languages/ja/helpers/splitIntoTokensCustomSpec.js
@@ -0,0 +1,57 @@
import splitIntoTokensCustom from "../../../../../src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom";

const testcases = [
	{
		description: "should return an empty result if the sentence is empty",
		sentence: { text: "", sourceCodeRange: { startOffset: 0, endOffset: 0 } },
		expected: [],
	},
	{
		description: "should correctly tokenize a simple Japanese sentence without punctuation",
		sentence: {
			text: "犬が大好き",
			sourceCodeRange: { startOffset: 0, endOffset: 5 },
		},
		expected: [ "犬", "が", "大好き" ],
	},
	{
		description: "should correctly tokenize a Japanese sentence with Japanese punctuation",
		sentence: {
			text: "犬が大好き\u3002",
			sourceCodeRange: { startOffset: 0, endOffset: 6 },
		},
		expected: [ "犬", "が", "大好き", "。" ],
	},
	{
		description: "should correctly tokenize a Japanese sentence with English punctuation",
		sentence: {
			text: "犬が大好き.",
			sourceCodeRange: { startOffset: 0, endOffset: 6 },
		},
		expected: [ "犬", "が", "大好き", "." ],
	},
	{
		description: "should correctly tokenize a Japanese sentence with quotation marks inside",
		sentence: {
			text: "犬「が」大好き\u3002",
			sourceCodeRange: { startOffset: 0, endOffset: 8 },
		},
		expected: [ "犬", "「", "が", "」", "大好き", "。" ],
	},
	{
		description: "should correctly tokenize a Japanese sentence with quotation marks around it",
		sentence: {
			text: "『犬が大好き\u3002』",
			sourceCodeRange: { startOffset: 0, endOffset: 8 },
		},
		expected: [ "『", "犬", "が", "大好き", "。", "』" ],
	},
];

describe.each( testcases )( "splitIntoTokensCustom for Japanese: %p", ( { description, sentence, expected } ) => {
	it( description, () => {
		const tokens = splitIntoTokensCustom( sentence );
		expect( tokens ).toEqual( expected );
	} );
} );
61 changes: 61 additions & 0 deletions packages/yoastseo/spec/parse/build/buildSpec.js
@@ -2,6 +2,7 @@ import build from "../../../src/parse/build/build";
import LanguageProcessor from "../../../src/parse/language/LanguageProcessor";
import Factory from "../../specHelpers/factory";
import memoizedSentenceTokenizer from "../../../src/languageProcessing/helpers/sentence/memoizedSentenceTokenizer";
import splitIntoTokensCustom from "../../../src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom";

describe( "The parse function", () => {
it( "parses a basic HTML text", () => {
@@ -67,6 +68,66 @@ describe( "The parse function", () => {
		} );
	} );

it( "parses a basic Japanese HTML text", () => {
const html = "<div><p class='yoast'>犬が大好き</p></div>";

const researcher = Factory.buildMockResearcher( {}, true, false, false,
{ splitIntoTokensCustom: splitIntoTokensCustom, memoizedTokenizer: memoizedSentenceTokenizer } );
const languageProcessor = new LanguageProcessor( researcher );
expect( build( html, languageProcessor ) ).toEqual( {
name: "#document-fragment",
attributes: {},
childNodes: [ {
name: "div",
sourceCodeLocation: {
startOffset: 0,
endOffset: 37,
startTag: {
startOffset: 0,
endOffset: 5,
},
endTag: {
startOffset: 31,
endOffset: 37,
},
},
attributes: {},
childNodes: [ {
name: "p",
isImplicit: false,
attributes: {
"class": new Set( [ "yoast" ] ),
},
sentences: [ {
text: "犬が大好き",
sourceCodeRange: { startOffset: 22, endOffset: 27 },
tokens: [
{ text: "犬", sourceCodeRange: { startOffset: 22, endOffset: 23 } },
{ text: "が", sourceCodeRange: { startOffset: 23, endOffset: 24 } },
{ text: "大好き", sourceCodeRange: { startOffset: 24, endOffset: 27 } },
],
} ],
childNodes: [ {
name: "#text",
value: "犬が大好き",
} ],
sourceCodeLocation: {
startOffset: 5,
endOffset: 31,
startTag: {
startOffset: 5,
endOffset: 22,
},
endTag: {
startOffset: 27,
endOffset: 31,
},
},
} ],
} ],
} );
} );

it( "adds implicit paragraphs around phrasing content outside of paragraphs and headings", () => {
const html = "<div>Hello <span>World!</span></div>";

79 changes: 79 additions & 0 deletions packages/yoastseo/spec/parse/build/private/tokenizeSpec.js
@@ -1,6 +1,7 @@
import tokenize from "../../../../src/parse/build/private/tokenize";
import Paper from "../../../../src/values/Paper";
import EnglishResearcher from "../../../../src/languageProcessing/languages/en/Researcher";
import JapaneseResearcher from "../../../../src/languageProcessing/languages/ja/Researcher";
import { buildTreeNoTokenize } from "../../../specHelpers/parse/buildTree";
import LanguageProcessor from "../../../../src/parse/language/LanguageProcessor";

@@ -192,3 +193,81 @@ describe( "A test for the tokenize function", function() {
		} );
	} );
} );

describe( "A test for tokenizing a japanese sentence", function() {
it( "should correctly tokenize a simple Japanse sentence.", function() {
const mockPaper = new Paper( "<p>犬が大好き\u3002</p>", { locale: "ja_JP" } );
const mockResearcher = new JapaneseResearcher( mockPaper );
const languageProcessor = new LanguageProcessor( mockResearcher );
buildTreeNoTokenize( mockPaper );
// eslint-disable-next-line max-len
expect( tokenize( mockPaper.getTree(), languageProcessor ) ).toEqual( {
attributes: {},
childNodes: [
{
attributes: {},
childNodes: [
{
name: "#text",
value: "犬が大好き。",
},
],
isImplicit: false,
name: "p",
sentences: [
{
sourceCodeRange: {
startOffset: 3,
endOffset: 9,
},
text: "犬が大好き。",
tokens: [
{
sourceCodeRange: {
startOffset: 3,
endOffset: 4,
},
text: "犬",
},
{
sourceCodeRange: {
startOffset: 4,
endOffset: 5,
},
text: "が",
},
{
sourceCodeRange: {
startOffset: 5,
endOffset: 8,
},
text: "大好き",
},
{
sourceCodeRange: {
startOffset: 8,
endOffset: 9,
},
text: "。",
},
],
},
],
sourceCodeLocation: {
startOffset: 0,
endOffset: 13,
startTag: {
startOffset: 0,
endOffset: 3,
},
endTag: {
startOffset: 9,
endOffset: 13,
},
},
},
],
name: "#document-fragment",
} );
} );
} );
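A quick sanity check on the offsets in this expectation (my addition, not part of the PR; it is plain arithmetic over the raw HTML string): the sentence starts at offset 3 because the opening "<p>" tag occupies offsets 0 through 2.

// Offsets index into the raw HTML "<p>犬が大好き。</p>".
const html = "<p>犬が大好き\u3002</p>";
console.log( html.indexOf( "犬" ) ); // 3 -> sentence startOffset
console.log( 3 + "犬が大好き\u3002".length ); // 9 -> sentence endOffset, where "</p>" begins
console.log( html.length ); // 13 -> endTag endOffset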
21 changes: 21 additions & 0 deletions packages/yoastseo/spec/parse/language/LanguageProcessorSpec.js
@@ -2,6 +2,7 @@ import LanguageProcessor from "../../../src/parse/language/LanguageProcessor";
import Factory from "../../specHelpers/factory";
import memoizedSentenceTokenizer from "../../../src/languageProcessing/helpers/sentence/memoizedSentenceTokenizer";
import Sentence from "../../../src/parse/structure/Sentence";
import splitIntoTokensCustom from "../../../src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom";

const researcher = Factory.buildMockResearcher( {}, true, false, false,
{ memoizedTokenizer: memoizedSentenceTokenizer } );
@@ -432,3 +433,23 @@ describe.each( splitIntoTokensTestCases )( "A test for the tokenize method", ( {
		expect( tokens ).toEqual( expectedTokens );
	} );
} );

describe( "A test for the splitIntoTokens method in Japanese", () => {
it( "should return an array of tokens", function() {
const japaneseResearcher = Factory.buildMockResearcher( {}, true, false, false,
{ splitIntoTokensCustom: splitIntoTokensCustom } );
const languageProcessor = new LanguageProcessor( japaneseResearcher );
const tokens = languageProcessor.splitIntoTokens( new Sentence( "ウクライナは、東ヨーロッパに位置する国家。" ) );
expect( tokens ).toEqual( [
{ text: "ウクライナ", sourceCodeRange: {} },
{ text: "は", sourceCodeRange: {} },
{ text: "、", sourceCodeRange: {} },
{ text: "東ヨーロッパ", sourceCodeRange: {} },
{ text: "に", sourceCodeRange: {} },
{ text: "位置", sourceCodeRange: {} },
{ text: "する", sourceCodeRange: {} },
{ text: "国家", sourceCodeRange: {} },
{ text: "。", sourceCodeRange: {} },
] );
} );
} );
2 changes: 2 additions & 0 deletions packages/yoastseo/src/languageProcessing/languages/ja/Researcher.js
@@ -10,6 +10,7 @@ import customCountLength from "./helpers/countCharacters";
import matchTransitionWordsHelper from "./helpers/matchTransitionWords";
import getContentWords from "./helpers/getContentWords";
import memoizedTokenizer from "./helpers/memoizedSentenceTokenizer";
import splitIntoTokensCustom from "./helpers/splitIntoTokensCustom";

// All config
import firstWordExceptions from "./config/firstWordExceptions";
@@ -72,6 +73,7 @@ export default class Researcher extends AbstractResearcher {
			customCountLength,
			matchTransitionWordsHelper,
			memoizedTokenizer,
			splitIntoTokensCustom,
		} );

		Object.assign( this.defaultResearches, {
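For orientation, a minimal sketch of what registering splitIntoTokensCustom under this.helpers buys (my illustration, not part of the diff; the relative import paths are assumptions):

import Researcher from "./Researcher";
import Paper from "../../../values/Paper";

const researcher = new Researcher( new Paper( "犬が大好き\u3002", { locale: "ja_JP" } ) );
// getHelper() is the lookup that LanguageProcessor.splitIntoTokens performs (see below);
// researchers that do not register this helper fall back to regex-based token splitting.
const splitIntoTokens = researcher.getHelper( "splitIntoTokensCustom" );
splitIntoTokens( { text: "犬が大好き\u3002" } ); // -> [ "犬", "が", "大好き", "。" ]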
22 changes: 22 additions & 0 deletions packages/yoastseo/src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom.js
@@ -0,0 +1,22 @@
import { map } from "lodash-es";
import TinySegmenter from "tiny-segmenter";

/**
 * Splits a sentence into tokens, using TinySegmenter for Japanese.
 *
 * @param {Sentence} sentence The sentence to split.
 *
 * @returns {string[]} The tokens as strings.
 */
function splitIntoTokensCustom( sentence ) {
	// Retrieve the sentence text from the Sentence object.
	const sentenceText = sentence.text;
	// Return an empty array if the sentence is empty.
	if ( sentenceText === "" ) {
		return [];
	}
	// Split the sentence into words with TinySegmenter; each word becomes a token.
	const words = new TinySegmenter().segment( sentenceText );
	return map( words );
}
export default splitIntoTokensCustom;
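A minimal usage sketch of this helper (mine, not from the PR; the import path is assumed). The expected output follows the spec file at the top of this commit:

import splitIntoTokensCustom from "./splitIntoTokensCustom";

// Only the `text` property is read, so a plain object stands in for a Sentence here.
const sentence = { text: "犬が大好き\u3002", sourceCodeRange: { startOffset: 0, endOffset: 6 } };
console.log( splitIntoTokensCustom( sentence ) ); // -> [ "犬", "が", "大好き", "。" ]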
8 changes: 4 additions & 4 deletions packages/yoastseo/src/parse/build/private/tokenize.js
@@ -6,12 +6,12 @@ import getTextElementPositions from "./getTextElementPositions";
  *
  * @param {Paragraph|Heading} node The paragraph or heading node to split into sentences.
  * @param {Sentence} sentence The sentence.
- * @param {function} splitIntoTokens The function to use to split the sentence into tokens.
+ * @param {LanguageProcessor} languageProcessor The language processor for the current language.
  *
  * @returns {Sentence} The sentence, with tokens.
  */
-function getTokens( node, sentence, splitIntoTokens ) {
-	sentence.tokens = splitIntoTokens( sentence );
+function getTokens( node, sentence, languageProcessor ) {
+	sentence.tokens = languageProcessor.splitIntoTokens( sentence );
 	sentence.tokens = getTextElementPositions( node, sentence.tokens, sentence.sourceCodeRange.startOffset );
 	return sentence;
 }
@@ -31,7 +31,7 @@ function getSentences( node, languageProcessor ) {
 	// Add position information to the sentences.
 	sentences = getTextElementPositions( node, sentences );
 	// Tokenize sentences into tokens.
-	return sentences.map( sentence => getTokens( node, sentence, languageProcessor.splitIntoTokens ) );
+	return sentences.map( sentence => getTokens( node, sentence, languageProcessor ) );
 }

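A note on why getTokens now receives the whole languageProcessor rather than languageProcessor.splitIntoTokens (my reading; the PR does not state it): passing the method as a bare function detaches it from its receiver, so the this.researcher.getHelper() lookup inside splitIntoTokens (see LanguageProcessor.js below) would throw. A self-contained sketch of the pitfall, with stand-in names:

class Processor {
	constructor() {
		this.helper = ( text ) => [ text ];
	}
	splitIntoTokens( sentence ) {
		// Relies on `this` being the Processor instance,
		// just as the real method relies on `this.researcher`.
		return this.helper( sentence.text );
	}
}

const processor = new Processor();
const detached = processor.splitIntoTokens;
// detached( { text: "犬" } ); // TypeError: cannot read `helper` of undefined
processor.splitIntoTokens( { text: "犬" } ); // works, as in the refactored getTokens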
7 changes: 7 additions & 0 deletions packages/yoastseo/src/parse/language/LanguageProcessor.js
@@ -72,6 +72,13 @@ class LanguageProcessor {
		// Retrieve the sentence text from the Sentence object.
		const sentenceText = sentence.text;

		// If a language-specific splitIntoTokensCustom helper exists, use its output as the tokens.
		const tokenTextsCustom = this.researcher.getHelper( "splitIntoTokensCustom" );
		if ( tokenTextsCustom ) {
			const tokensCustom = tokenTextsCustom( sentence );
			return tokensCustom.map( tokenText => new Token( tokenText ) );
		}

		// Split the sentence string into tokens. Those tokens are unrefined as they may contain punctuation.
		const rawTokens = sentenceText.split( wordSeparatorsRegex ).filter( x => x !== "" );

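This also explains why the LanguageProcessorSpec expectations above show sourceCodeRange: {} on every token: splitIntoTokens wraps each segmented string in a Token without position data, and offsets are only filled in later by getTextElementPositions during tokenize(). A stand-in sketch (the real Token constructor's shape is an assumption here):

// Stand-in for yoastseo's Token class (assumed: defaults to an empty source code range).
class Token {
	constructor( text, sourceCodeRange = {} ) {
		this.text = text;
		this.sourceCodeRange = sourceCodeRange;
	}
}

const tokens = [ "国家", "。" ].map( ( text ) => new Token( text ) );
console.log( tokens[ 0 ] ); // Token { text: "国家", sourceCodeRange: {} }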
