javascript - Wrapping Sentences within <p> Tags with <span>'s, But Keep Other Tags -
To give an idea of what I need: I have been using the code below to parse the content within <p>
tags and wrap each sentence within <span> tags, so that I can interact with the sentences on the page.
// Wrap every sentence of each <p> in <span class="sentence">…</span>.
// The paragraph is read with .text(), so only its text content is processed.
$('p').each(function () {
    var paragraph = $(this);
    var wrapped = paragraph.text().replace(
        /(((?![.!?]['"]?\s).)*[.!?]['"]?)(\s|$)/g,
        '<span class="sentence">$1</span>$3'
    );

    paragraph.html(wrapped);
});
However, the following line demonstrates my problem:
<p>this <a href="#">link</a> , removed above code! here sentence.</p>
Nested tags such as <a>, <img> etc. within the <p> tags that I'm searching through are removed by the code I'm using. I need to keep these tags intact, so that the content stays the same within the <p> tags.
i need:
<p><span class="sentence">this <a href="#">link</a> , removed above code!</span><span class="sentence">here sentence.</span></p>
After reading this barn-burner on parsing HTML with regex, I've concluded that I need to use a combination of an HTML parser of some sort, to traverse the sub-tags within a <p> tag, and a regex to find the sentences. I think the regex I have listed above should work for most of my uses, if that helps.
So: how should I do it?
It is difficult to tokenise language reliably into sentences, and that is without the added complexity of throwing HTML into the equation. There are applications out there that attempt to deal with natural language processing; an example is the Stanford Tokenizer, which runs on Java (not JavaScript).
And, as people keep mentioning, a regex is not the solution to this problem: language is not regular, so don't expect a regular-expression-only solution.
There is a question here on SO, "Basic NLP in CoffeeScript or JavaScript — Punkt tokenization, simple trained Bayes models — where to start?", which I think summarises things for JavaScript.
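Anyway, to at least give you a little something to play with, I knocked up a little code for you. It works reasonably well until the markup/language begins to resemble anything complex or different, at which point it fails the mark by a long way. But it may be enough for what you need; I don't know.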
css
/* Text-style helpers. */
.emphasis {
    font-style: italic;
}

.bold {
    font-weight: bold;
}

.emphasis.bold {
    font-style: italic;
    font-weight: bold;
}

/* Background colours used to make each generated span visible. */
.unidentified {
    background-color: pink;
}

.sentence0 {
    background-color: yellow;
}

.sentence1 {
    background-color: green;
}

.sentence2 {
    background-color: red;
}

/* Preserve whitespace inside the span so it is rendered as written. */
.whitespace {
    white-space: pre;
    background-color: blue;
}
javascript
/*jslint maxerr: 50, indent: 4, browser: true */

/**
 * Best-effort sentence highlighter.
 *
 * For every <p> element in the document, the paragraph's innerHTML is consumed
 * from the left by a chain of small handlers. Each handler either recognises
 * the start of the remaining markup, pushes a <span> for it and returns the
 * unconsumed rest, or returns the markup unchanged so the next handler can try.
 * The paragraph is finally replaced by the concatenated spans.
 *
 * This is a heuristic, regex-driven approach; it is not an HTML parser.
 */
(function () {
    "use strict";

    var rxOpen = new RegExp("<[^\\/].+?>"),
        rxClose = new RegExp("<\\/.+?>"),
        rxWhitespace = new RegExp("^\\s+?"),
        rxSupStart = new RegExp("^<sup\\b[^>]*>"),
        rxSupEnd = new RegExp("<\/sup>"),
        supCloseLength = "</sup>".length,
        spanCloseLength = "</span>".length,
        // Upper bound on handler passes per paragraph.
        maxPasses = 100,
        sentenceEnd = [],
        color = 0,
        rxIndex,
        handlers;

    // The sentence-end pattern is assembled from fragments: terminating
    // punctuation not preceded by a digit, followed by lookaheads that reject
    // a match sitting inside quotes, (), [], {} or ||.
    sentenceEnd.push(new RegExp("[^\\d][\\.!\\?]+"));
    sentenceEnd.push(new RegExp("(?=([^\\\"]*\\\"[^\\\"]*\\\")*[^\\\"]*?$)"));
    sentenceEnd.push(new RegExp("(?![^\\(]*?\\))"));
    sentenceEnd.push(new RegExp("(?![^\\[]*?\\])"));
    sentenceEnd.push(new RegExp("(?![^\\{]*?\\})"));
    sentenceEnd.push(new RegExp("(?![^\\|]*?\\|)"));
    // Disabled by the original author (the second one is noted as problematic):
    //sentenceEnd.push(new RegExp("(?![^\\\\]*?\\\\)"));
    //sentenceEnd.push(new RegExp("(?![^\\/.]*\\/)"));

    rxIndex = new RegExp(sentenceEnd.reduce(function (previousValue, currentValue) {
        return previousValue + currentValue.source;
    }, ""));

    /**
     * Locate the first sentence end in `html`.
     *
     * @param {string} html Markup to search.
     * @returns {number} Index of the last character of the first match of the
     *     sentence-end pattern, or -1 when there is no match.
     */
    function indexSentenceEnd(html) {
        var match = rxIndex.exec(html);

        if (match === null) {
            return -1;
        }

        return match.index + match[0].length - 1;
    }

    /**
     * Append `<span class="...">string</span>` to `array`.
     *
     * When `className` is exactly "sentence" it becomes "sentence0" or
     * "sentence1" (alternating on a shared counter) and `classNameOpt`, if
     * given, is added as a second class. For any other class name
     * `classNameOpt` is ignored.
     *
     * @param {string[]} array Output list of span strings (mutated).
     * @param {string} className Base class name.
     * @param {string} string Markup placed inside the span.
     * @param {string} [classNameOpt] Extra class for "sentence" spans.
     */
    function pushSpan(array, className, string, classNameOpt) {
        if (className === "sentence") {
            className += color % 2;
            if (classNameOpt) {
                className += " " + classNameOpt;
            }
            color += 1;
        }

        array.push('<span class="' + className + '">' + string + '</span>');
    }

    /**
     * If `html` starts with a complete <sup>...</sup>, move it inside the
     * previously pushed span (just before its closing </span>).
     *
     * Returns `html` unchanged when it does not start with <sup>, when the
     * <sup> is never closed, or when there is no previous span to attach to,
     * so that the later handlers deal with the markup instead.
     *
     * @param {string} html Remaining markup.
     * @param {string[]} array Output list of span strings (mutated).
     * @returns {string} Markup left after the consumed <sup> element.
     */
    function addSupToPrevious(html, array) {
        var closeIndex,
            end,
            last;

        if (array.length === 0 || html.search(rxSupStart) === -1) {
            return html;
        }

        closeIndex = html.search(rxSupEnd);
        if (closeIndex === -1) {
            return html;
        }

        end = closeIndex + supCloseLength;
        last = array.pop();
        array.push(last.slice(0, -spanCloseLength) + html.slice(0, end) + last.slice(-spanCloseLength));

        return html.slice(end);
    }

    /**
     * Consume leading whitespace into a "whitespace" span.
     *
     * The pattern is lazy, so one whitespace character is consumed per call.
     *
     * @param {string} html Remaining markup.
     * @param {string[]} array Output list of span strings (mutated).
     * @returns {string} Markup left after the consumed whitespace.
     */
    function leadingWhitespaces(html, array) {
        var match = rxWhitespace.exec(html);

        if (match === null) {
            return html;
        }

        pushSpan(array, "whitespace", match[0]);

        return html.slice(match[0].length);
    }

    /**
     * Treat the whole remaining markup as one sentence when no sentence end
     * is found in it.
     *
     * NOTE(review): indexSentenceEnd() returns at most html.length - 1, so the
     * `index === html.length` comparison never holds; only the -1 case fires.
     *
     * @param {string} html Remaining markup.
     * @param {string[]} array Output list of span strings (mutated).
     * @returns {string} "" when consumed, otherwise `html` unchanged.
     */
    function paragraphIsSentence(html, array) {
        var index = indexSentenceEnd(html);

        if (index === -1 || index === html.length) {
            pushSpan(array, "sentence", html, "paragraphissentence");
            html = "";
        }

        return html;
    }

    /**
     * When the remaining markup contains no opening tag, consume text up to
     * and including the first sentence end (or everything if there is none).
     *
     * @param {string} html Remaining markup.
     * @param {string[]} array Output list of span strings (mutated).
     * @returns {string} Markup left after the consumed sentence.
     */
    function paragraphNoMarkup(html, array) {
        var open = html.search(rxOpen),
            index = 0;

        if (open === -1) {
            index = indexSentenceEnd(html);
            if (index === -1) {
                index = html.length;
            }
            index += 1;
            pushSpan(array, "sentence", html.slice(0, index), "paragraphnomarkup");
        }

        return html.slice(index);
    }

    /**
     * When markup is present and the first sentence end lies before the first
     * opening tag or after the first closing tag, consume up to and including
     * that sentence end.
     *
     * @param {string} html Remaining markup.
     * @param {string[]} array Output list of span strings (mutated).
     * @returns {string} Markup left after the consumed sentence.
     */
    function sentenceUncontained(html, array) {
        var open = html.search(rxOpen),
            index = 0,
            close;

        if (open !== -1) {
            index = indexSentenceEnd(html);
            if (index === -1) {
                index = html.length;
            }

            close = html.search(rxClose);
            if (index < open || index > close) {
                index += 1;
                pushSpan(array, "sentence", html.slice(0, index), "sentenceuncontained");
            } else {
                index = 0;
            }
        }

        return html.slice(index);
    }

    /**
     * When the first sentence end lies strictly between the first opening tag
     * and the first closing tag, consume everything up to and including that
     * closing tag so the element is not split.
     *
     * @param {string} html Remaining markup.
     * @param {string[]} array Output list of span strings (mutated).
     * @returns {string} Markup left after the consumed element.
     */
    function sentenceContained(html, array) {
        var open = html.search(rxOpen),
            index = 0,
            close,
            count;

        if (open !== -1) {
            index = indexSentenceEnd(html);
            if (index === -1) {
                index = html.length;
            }

            close = html.search(rxClose);
            if (index > open && index < close) {
                count = html.match(rxClose)[0].length;
                index = close + count;
                pushSpan(array, "sentence", html.slice(0, index), "sentencecontained");
            } else {
                index = 0;
            }
        }

        return html.slice(index);
    }

    /**
     * Fallback: wrap all remaining markup in a single "sentence2" span.
     *
     * @param {string} html Remaining markup.
     * @param {string[]} array Output list of span strings (mutated).
     * @returns {string} Always "".
     */
    function anythingElse(html, array) {
        pushSpan(array, "sentence2", html, "anythingelse");

        return "";
    }

    // Handlers in priority order; the first one that shortens the markup wins.
    handlers = [
        addSupToPrevious,
        leadingWhitespaces,
        paragraphIsSentence,
        paragraphNoMarkup,
        sentenceUncontained,
        sentenceContained,
        anythingElse
    ];

    /**
     * Run the handlers in order and stop at the first that consumes input.
     *
     * @param {string} html Remaining markup (non-empty).
     * @param {string[]} array Output list of span strings (mutated).
     * @returns {string} Markup left after one handler consumed its part.
     */
    function consumeNext(html, array) {
        var length = html.length,
            i;

        for (i = 0; i < handlers.length; i += 1) {
            html = handlers[i](html, array);
            if (html.length !== length) {
                break;
            }
        }

        return html;
    }

    /**
     * Rewrite every <p> in the document as a sequence of highlighted spans.
     */
    function guessSentences() {
        var paragraphs = document.getElementsByTagName("p");

        Array.prototype.forEach.call(paragraphs, function (paragraph) {
            var html = paragraph.innerHTML,
                array = [],
                safety = maxPasses;

            while (html.length && safety) {
                html = consumeNext(html, array);
                safety -= 1;
            }

            // If the pass limit was reached, keep the unprocessed remainder
            // instead of silently dropping it from the paragraph.
            if (html.length) {
                anythingElse(html, array);
            }

            paragraph.innerHTML = array.join("");
        });
    }

    // Nothing to highlight when no DOM is available.
    if (typeof document === "undefined") {
        return;
    }

    guessSentences();
}());
on jsfiddle
Comments
Post a Comment