feat: track emphasized text in partition_html (#1034)

* Feat/965 track emphasized text html (#1021)

* feat: add functionality to track emphasized text (<strong>, <em>, <span>, <b>, <i> tags) in HTML

* feat: add `include_tail_text` parameter to `_construct_text`

* test: add test case for `_get_emphasized_texts_from_tag`

* test: add `emphasized_texts` to metadata

* chore: update changelog & version

* fix tests

* fix lint errors

* chore: update changelog

* chore: small comment updates

* feat: update `XMLDocument._read_xml` to create `<p>` tag element for the text enclosed in the `<pre>` tag

* chore: update changelog

* Update ingest test fixtures (#1026)

Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
Co-authored-by: Matt Robinson <mrobinson@unstructured.io>

* ingest-test-fixtures-update

* Update ingest test fixtures (#1035)

Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>

---------

Co-authored-by: Christine Straub <christinemstraub@gmail.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
This commit is contained in:
Matt Robinson 2023-08-03 12:24:25 -04:00 committed by GitHub
parent 73eeae852e
commit f4ddf53590
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
28 changed files with 1539 additions and 151 deletions

View File

@ -3,6 +3,9 @@
* Adds post processing function `filter_element_types`
* Set the default strategy for partitioning images to `hi_res`
* Add page break parameter section in API documentation to sync with change in Prod API
* Update `XMLDocument._read_xml` to create `<p>` tag element for the text enclosed in the `<pre>` tag
* Track emphasized texts in `partition_html` output
* Add parameter `include_tail_text` to `_construct_text` to control whether tail text is included or skipped
### Features

View File

@ -139,6 +139,44 @@ def test_construct_text(doc, expected):
assert text == expected
@pytest.mark.parametrize(
    ("doc", "root", "expected"),
    [
        # Several sibling emphasis tags inside one paragraph; one entry per tag.
        (
            "<p>Hello <strong>there</strong> I <em>am</em> a <b>very</b> <i>important</i> text</p>",
            "p",
            [
                {"text": "there", "tag": "strong"},
                {"text": "am", "tag": "em"},
                {"text": "very", "tag": "b"},
                {"text": "important", "tag": "i"},
            ],
        ),
        # Nested emphasis: the outer <b> entry carries the combined text and the
        # inner <i> is additionally listed on its own.
        (
            "<p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>",
            "p",
            [
                {"text": "list", "tag": "span"},
                {"text": "my favorite things", "tag": "b"},
                {"text": "favorite", "tag": "i"},
            ],
        ),
        # The looked-up element is itself an emphasis tag.
        (
            "<strong>A lone strong text!</strong>",
            "strong",
            [{"text": "A lone strong text!", "tag": "strong"}],
        ),
        # Text after the closing tag (the tail) is not part of the expected entry.
        ("<span>I have a</span> tail", "span", [{"text": "I have a", "tag": "span"}]),
        # Lookup of a <p> that the markup does not contain: expects an empty result.
        ("<span>Empty result</span> ", "p", []),
    ],
)
def test_get_emphasized_texts_from_tag(doc, expected, root):
    """Check `_get_emphasized_texts_from_tag` against hand-written expectations.

    `doc` is parsed with lxml's HTML parser, the first element matching the tag
    name `root` is located, and the helper's return value for that element must
    equal `expected` (a list of ``{"text": ..., "tag": ...}`` dicts).
    """
    parsed_tree = etree.fromstring(doc, etree.HTMLParser())
    target = parsed_tree.find(f".//{root}")
    assert html._get_emphasized_texts_from_tag(target) == expected
def test_parse_nothing():
doc = """<p></p>"""
document_tree = etree.fromstring(doc, etree.HTMLParser())

View File

@ -480,3 +480,41 @@ def test_partition_html_grabs_links():
"url": "/loner",
},
]
def test_partition_html_grabs_emphasized_texts():
    """Verify that `partition_html` records emphasized texts in element metadata.

    The document mixes paragraphs, list items and a bare <span>; each resulting
    element is checked for both its type/text and the `emphasized_texts`
    metadata attached to it.
    """
    html_text = """<html>
<p>Hello there I am a very <strong>important</strong> text!</p>
<p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>
<ul>
<li><em>Parrots</em></li>
<li>Dogs</li>
</ul>
<span>A lone span text!</span>
</html>"""
    elements = partition_html(text=html_text)

    # Paragraph with a single <strong> run.
    strong_paragraph = elements[0]
    assert strong_paragraph == NarrativeText("Hello there I am a very important text!")
    assert strong_paragraph.metadata.emphasized_texts == [
        {"text": "important", "tag": "strong"},
    ]

    # Paragraph with a <span> plus an <i> nested inside a <b>.
    nested_paragraph = elements[1]
    assert nested_paragraph == NarrativeText("Here is a list of my favorite things")
    assert nested_paragraph.metadata.emphasized_texts == [
        {"text": "list", "tag": "span"},
        {"text": "my favorite things", "tag": "b"},
        {"text": "favorite", "tag": "i"},
    ]

    # List item whose whole content is wrapped in <em>.
    emphasized_item = elements[2]
    assert emphasized_item == ListItem("Parrots")
    assert emphasized_item.metadata.emphasized_texts == [
        {"text": "Parrots", "tag": "em"},
    ]

    # List item without any emphasis: the metadata field stays None.
    plain_item = elements[3]
    assert plain_item == ListItem("Dogs")
    assert plain_item.metadata.emphasized_texts is None

    # Stand-alone <span> outside of any paragraph.
    lone_span = elements[4]
    assert lone_span == Title("A lone span text!")
    assert lone_span.metadata.emphasized_texts == [
        {"text": "A lone span text!", "tag": "span"},
    ]

View File

@ -499,7 +499,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.",
"tag": "strong"
}
]
},
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
},
@ -515,6 +521,12 @@
"text": "Spring Safety website",
"url": "https://www.weather.gov/wrn/spring-safety"
}
],
"emphasized_texts": [
{
"text": "Second, encourage others to become Weather-Ready as well.",
"tag": "strong"
}
]
},
"text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic."

View File

@ -15,6 +15,12 @@
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -15,6 +15,12 @@
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -5,7 +5,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Driver",
"tag": "strong"
}
]
},
"text": "Driver"
},
@ -15,7 +21,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Approver",
"tag": "strong"
}
]
},
"text": "Approver"
},
@ -25,7 +37,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Contributors",
"tag": "strong"
}
]
},
"text": "Contributors"
},
@ -35,7 +53,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Informed",
"tag": "strong"
}
]
},
"text": "Informed"
},
@ -45,7 +69,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Objective",
"tag": "strong"
}
]
},
"text": "Objective"
},
@ -55,7 +85,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Due date",
"tag": "strong"
}
]
},
"text": "Due date"
},
@ -65,7 +101,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Key outcomes",
"tag": "strong"
}
]
},
"text": "Key outcomes"
},
@ -75,7 +117,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Status",
"tag": "strong"
}
]
},
"text": "Status"
},
@ -85,7 +133,21 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "NOT STARTED",
"tag": "span"
},
{
"text": "IN PROGRESS",
"tag": "span"
},
{
"text": "COMPLETE",
"tag": "span"
}
]
},
"text": "NOT STARTED / IN PROGRESS / COMPLETE"
},
@ -115,7 +177,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Must have:",
"tag": "strong"
}
]
},
"text": "Must have:"
},
@ -145,7 +213,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Nice to have:",
"tag": "strong"
}
]
},
"text": "Nice to have:"
},
@ -175,7 +249,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Not in scope:",
"tag": "strong"
}
]
},
"text": "Not in scope:"
},
@ -305,7 +385,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Milestone",
"tag": "strong"
}
]
},
"text": "Milestone"
},
@ -315,7 +401,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Owner",
"tag": "strong"
}
]
},
"text": "Owner"
},
@ -325,7 +417,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Deadline",
"tag": "strong"
}
]
},
"text": "Deadline"
},
@ -335,7 +433,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Status",
"tag": "strong"
}
]
},
"text": "Status"
},

View File

@ -75,7 +75,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Time",
"tag": "strong"
}
]
},
"text": "Time"
},
@ -85,7 +91,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Item",
"tag": "strong"
}
]
},
"text": "Item"
},
@ -95,7 +107,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Presenter",
"tag": "strong"
}
]
},
"text": "Presenter"
},
@ -105,7 +123,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Notes",
"tag": "strong"
}
]
},
"text": "Notes"
},

View File

@ -15,7 +15,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Win",
"tag": "strong"
}
]
},
"text": "Win"
},
@ -55,7 +61,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Needs input",
"tag": "strong"
}
]
},
"text": "Needs input"
},
@ -95,7 +107,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Focus",
"tag": "strong"
}
]
},
"text": "Focus"
},
@ -125,7 +143,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Notes",
"tag": "strong"
}
]
},
"text": "Notes"
},
@ -135,7 +159,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Important Links",
"tag": "strong"
}
]
},
"text": "Important Links"
}

View File

@ -25,7 +25,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Add a header image.",
"tag": "strong"
}
]
},
"text": "Add a header image. This gives your overview visual appeal and makes it welcoming for visitors."
},
@ -35,7 +41,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Explain what the space is for.",
"tag": "strong"
}
]
},
"text": "Explain what the space is for. Start by summarizing the purpose of the space. This could be your team's mission statement or a brief description of the kind of work you do."
},
@ -59,6 +71,12 @@
"text": "product roadmaps",
"url": "https://www.atlassian.com/software/confluence/templates/product-roadmap"
}
],
"emphasized_texts": [
{
"text": "Share team goals.",
"tag": "strong"
}
]
},
"text": "Share team goals. Add links to your team's OKRs, project plans, and product roadmaps so visitors can quickly get a sense of your team's goals."
@ -69,7 +87,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Tell people how to contact you.",
"tag": "strong"
}
]
},
"text": "Tell people how to contact you. Share your timezone and links to Slack channels, email aliases, or other contact details your team uses so visitors can contact you with questions or feedback about your team's work."
},
@ -221,7 +245,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "To add functionality:",
"tag": "strong"
}
]
},
"text": "To add functionality:"
},
@ -251,7 +281,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Insert",
"tag": "strong"
}
]
},
"text": "Select Insert"
},
@ -271,7 +307,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Introduce the team",
"tag": "strong"
}
]
},
"text": "Introduce the team"
},
@ -297,7 +339,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Share news and announcements with your team",
"tag": "strong"
}
]
},
"text": "Share news and announcements with your team"
},
@ -323,7 +371,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Display a list of important pages",
"tag": "strong"
}
]
},
"text": "Display a list of important pages"
},

View File

@ -75,7 +75,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "What is your team all about?",
"tag": "em"
},
{
"text": "What is your team all about?",
"tag": "span"
}
]
},
"text": "What is your team all about?"
},
@ -95,7 +105,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "What is your team's mission? What is your vision?",
"tag": "em"
},
{
"text": "What is your team's mission? What is your vision?",
"tag": "span"
}
]
},
"text": "What is your team's mission? What is your vision?"
},
@ -115,7 +135,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Add team members to your space.",
"tag": "em"
},
{
"text": "Add team members to your space.",
"tag": "span"
}
]
},
"text": "Add team members to your space."
},
@ -125,7 +155,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Team member",
"tag": "span"
}
]
},
"text": "Team member"
},
@ -135,7 +171,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Role",
"tag": "span"
}
]
},
"text": "Role"
},
@ -145,7 +187,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Responsibility",
"tag": "span"
}
]
},
"text": "Responsibility"
},
@ -155,7 +203,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Team member",
"tag": "span"
}
]
},
"text": "Team member"
},
@ -165,7 +219,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Role",
"tag": "span"
}
]
},
"text": "Role"
},
@ -175,7 +235,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Responsibility",
"tag": "span"
}
]
},
"text": "Responsibility"
},
@ -185,7 +251,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Team member",
"tag": "span"
}
]
},
"text": "Team member"
},
@ -195,7 +267,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Role",
"tag": "span"
}
]
},
"text": "Role"
},
@ -205,7 +283,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Responsibility",
"tag": "span"
}
]
},
"text": "Responsibility"
},
@ -225,7 +309,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "How can someone reach out to your team?",
"tag": "em"
},
{
"text": "How can someone reach out to your team?",
"tag": "span"
}
]
},
"text": "How can someone reach out to your team?"
},
@ -241,6 +335,12 @@
"text": null,
"url": "mailto:team@email.com"
}
],
"emphasized_texts": [
{
"text": "team@email.com",
"tag": "span"
}
]
},
"text": "team@email.com"
@ -251,7 +351,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Tickets",
"tag": "span"
}
]
},
"text": "Tickets"
},
@ -261,7 +367,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Jira board",
"tag": "span"
}
]
},
"text": "Jira board"
},
@ -271,7 +383,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "#channel",
"tag": "span"
}
]
},
"text": "#channel"
},
@ -291,7 +409,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "List them here",
"tag": "em"
},
{
"text": "List them here",
"tag": "span"
}
]
},
"text": "List them here"
},
@ -341,7 +469,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Add resources for new hires",
"tag": "em"
},
{
"text": "Add resources for new hires",
"tag": "span"
}
]
},
"text": "Add resources for new hires"
},
@ -361,7 +499,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Add links to meeting notes",
"tag": "em"
},
{
"text": "Add links to meeting notes",
"tag": "span"
}
]
},
"text": "Add links to meeting notes"
},
@ -381,7 +529,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "List them here",
"tag": "em"
},
{
"text": "List them here",
"tag": "span"
}
]
},
"text": "List them here"
},
@ -401,7 +559,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Create a blog post to share team news. It will automatically appear here once it's published.",
"tag": "em"
},
{
"text": "Create a blog post to share team news. It will automatically appear here once it's published.",
"tag": "span"
}
]
},
"text": "Create a blog post to share team news. It will automatically appear here once it's published."
},

View File

@ -75,7 +75,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "What is your team all about?",
"tag": "em"
},
{
"text": "What is your team all about?",
"tag": "span"
}
]
},
"text": "What is your team all about?"
},
@ -95,7 +105,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "What is your team's mission? What is your vision?",
"tag": "em"
},
{
"text": "What is your team's mission? What is your vision?",
"tag": "span"
}
]
},
"text": "What is your team's mission? What is your vision?"
},
@ -115,7 +135,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Add team members to your space.",
"tag": "em"
},
{
"text": "Add team members to your space.",
"tag": "span"
}
]
},
"text": "Add team members to your space."
},
@ -125,7 +155,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Team member",
"tag": "span"
}
]
},
"text": "Team member"
},
@ -135,7 +171,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Role",
"tag": "span"
}
]
},
"text": "Role"
},
@ -145,7 +187,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Responsibility",
"tag": "span"
}
]
},
"text": "Responsibility"
},
@ -155,7 +203,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Team member",
"tag": "span"
}
]
},
"text": "Team member"
},
@ -165,7 +219,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Role",
"tag": "span"
}
]
},
"text": "Role"
},
@ -175,7 +235,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Responsibility",
"tag": "span"
}
]
},
"text": "Responsibility"
},
@ -185,7 +251,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Team member",
"tag": "span"
}
]
},
"text": "Team member"
},
@ -195,7 +267,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Role",
"tag": "span"
}
]
},
"text": "Role"
},
@ -205,7 +283,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Responsibility",
"tag": "span"
}
]
},
"text": "Responsibility"
},
@ -225,7 +309,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "How can someone reach out to your team?",
"tag": "em"
},
{
"text": "How can someone reach out to your team?",
"tag": "span"
}
]
},
"text": "How can someone reach out to your team?"
},
@ -241,6 +335,12 @@
"text": null,
"url": "mailto:team@email.com"
}
],
"emphasized_texts": [
{
"text": "team@email.com",
"tag": "span"
}
]
},
"text": "team@email.com"
@ -251,7 +351,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Tickets",
"tag": "span"
}
]
},
"text": "Tickets"
},
@ -261,7 +367,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Jira board",
"tag": "span"
}
]
},
"text": "Jira board"
},
@ -271,7 +383,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "#channel",
"tag": "span"
}
]
},
"text": "#channel"
},
@ -291,7 +409,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "List them here",
"tag": "em"
},
{
"text": "List them here",
"tag": "span"
}
]
},
"text": "List them here"
},
@ -341,7 +469,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Add resources for new hires",
"tag": "em"
},
{
"text": "Add resources for new hires",
"tag": "span"
}
]
},
"text": "Add resources for new hires"
},
@ -361,7 +499,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Add links to meeting notes",
"tag": "em"
},
{
"text": "Add links to meeting notes",
"tag": "span"
}
]
},
"text": "Add links to meeting notes"
},
@ -381,7 +529,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "List them here",
"tag": "em"
},
{
"text": "List them here",
"tag": "span"
}
]
},
"text": "List them here"
},
@ -401,7 +559,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Create a blog post to share team news. It will automatically appear here once it's published.",
"tag": "em"
},
{
"text": "Create a blog post to share team news. It will automatically appear here once it's published.",
"tag": "span"
}
]
},
"text": "Create a blog post to share team news. It will automatically appear here once it's published."
},

View File

@ -211,7 +211,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 Checklist Item 1",
"tag": "span"
}
]
},
"text": "Testdoc3 Checklist Item 1"
},
@ -221,7 +227,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 Checklist Item 2 (checked)",
"tag": "span"
}
]
},
"text": "Testdoc3 Checklist Item 2 (checked)"
},
@ -231,7 +243,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 Checklist Item 3",
"tag": "span"
}
]
},
"text": "Testdoc3 Checklist Item 3"
},
@ -251,7 +269,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 bold text",
"tag": "strong"
}
]
},
"text": "Testdoc3 bold text"
},
@ -261,7 +285,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 italic text",
"tag": "em"
}
]
},
"text": "Testdoc3 italic text"
},
@ -321,7 +351,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 Table: Column 1 Row 0",
"tag": "strong"
}
]
},
"text": "Testdoc3 Table: Column 1 Row 0"
},
@ -331,7 +367,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 Table: Column 2 Row 0",
"tag": "strong"
}
]
},
"text": "Testdoc3 Table: Column 2 Row 0"
},
@ -341,7 +383,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 Table: Column 3 Row 0",
"tag": "strong"
}
]
},
"text": "Testdoc3 Table: Column 3 Row 0"
},
@ -351,7 +399,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 Table: Column 1 Row 1",
"tag": "strong"
}
]
},
"text": "Testdoc3 Table: Column 1 Row 1"
},
@ -361,7 +415,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 Table: Column 2 Row 1",
"tag": "strong"
}
]
},
"text": "Testdoc3 Table: Column 2 Row 1"
},
@ -371,7 +431,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 Table: Column 3 Row 1",
"tag": "strong"
}
]
},
"text": "Testdoc3 Table: Column 3 Row 1"
},
@ -381,7 +447,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 Table: Column 1 Row 2",
"tag": "strong"
}
]
},
"text": "Testdoc3 Table: Column 1 Row 2"
},
@ -391,7 +463,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 Table: Column 2 Row 2",
"tag": "strong"
}
]
},
"text": "Testdoc3 Table: Column 2 Row 2"
},
@ -401,7 +479,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc3 Table: Column 3 Row 2",
"tag": "strong"
}
]
},
"text": "Testdoc3 Table: Column 3 Row 2"
}

View File

@ -211,7 +211,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Checklist Item 1",
"tag": "span"
}
]
},
"text": "Testdoc2 Checklist Item 1"
},
@ -221,7 +227,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Checklist Item 2 (checked)",
"tag": "span"
}
]
},
"text": "Testdoc2 Checklist Item 2 (checked)"
},
@ -231,7 +243,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Checklist Item 3",
"tag": "span"
}
]
},
"text": "Testdoc2 Checklist Item 3"
},
@ -251,7 +269,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 bold text",
"tag": "strong"
}
]
},
"text": "Testdoc2 bold text"
},
@ -261,7 +285,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 italic text",
"tag": "em"
}
]
},
"text": "Testdoc2 italic text"
},
@ -321,7 +351,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 1 Row 0",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 1 Row 0"
},
@ -331,7 +367,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 2 Row 0",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 2 Row 0"
},
@ -341,7 +383,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 3 Row 0",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 3 Row 0"
},
@ -351,7 +399,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 1 Row 1",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 1 Row 1"
},
@ -361,7 +415,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 2 Row 1",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 2 Row 1"
},
@ -371,7 +431,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 3 Row 1",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 3 Row 1"
},
@ -381,7 +447,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 1 Row 2",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 1 Row 2"
},
@ -391,7 +463,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 2 Row 2",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 2 Row 2"
},
@ -401,7 +479,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 3 Row 2",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 3 Row 2"
}

View File

@ -5,7 +5,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Welcome to Confluence!",
"tag": "strong"
},
{
"text": "Welcome to Confluence!",
"tag": "span"
}
]
},
"text": "Welcome to Confluence!"
},
@ -35,7 +45,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "On this page",
"tag": "strong"
}
]
},
"text": "On this page"
},
@ -45,7 +61,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Confluence 101",
"tag": "strong"
},
{
"text": "Confluence 101",
"tag": "span"
}
]
},
"text": "Confluence 101"
},
@ -97,7 +123,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "TEAM SPACES",
"tag": "span"
}
]
},
"text": "TEAM SPACES"
},
@ -117,7 +149,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "PROJECT SPACES",
"tag": "span"
}
]
},
"text": "PROJECT SPACES"
},
@ -137,7 +175,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "PERSONAL SPACES",
"tag": "span"
}
]
},
"text": "PERSONAL SPACES"
},
@ -157,7 +201,37 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "In the main navigation bar, select Spaces→ Create space to create a new space.",
"tag": "span"
},
{
"text": "In the main navigation bar, select",
"tag": "strong"
},
{
"text": "In the main navigation bar, select",
"tag": "span"
},
{
"text": "→",
"tag": "strong"
},
{
"text": "→",
"tag": "span"
},
{
"text": "to create a new space.",
"tag": "strong"
},
{
"text": "to create a new space.",
"tag": "span"
}
]
},
"text": "In the main navigation bar, select Spaces→ Create space to create a new space."
},
@ -213,7 +287,29 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Press c to create a page at any time.",
"tag": "span"
},
{
"text": "Press",
"tag": "strong"
},
{
"text": "Press",
"tag": "span"
},
{
"text": "to create a page at any time.",
"tag": "strong"
},
{
"text": "to create a page at any time.",
"tag": "span"
}
]
},
"text": "Press c to create a page at any time."
},
@ -239,7 +335,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Visualize the relationship between spaces and pages",
"tag": "span"
}
]
},
"text": "Visualize the relationship between spaces and pages"
},
@ -265,7 +367,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 2
"page_number": 2,
"emphasized_texts": [
{
"text": "Making purposeful content",
"tag": "strong"
},
{
"text": "Making purposeful content",
"tag": "span"
}
]
},
"text": "Making purposeful content"
},
@ -315,7 +427,29 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 2
"page_number": 2,
"emphasized_texts": [
{
"text": "Press e when on a page to open the editor.",
"tag": "span"
},
{
"text": "Press",
"tag": "strong"
},
{
"text": "Press",
"tag": "span"
},
{
"text": "when on a page to open the editor.",
"tag": "strong"
},
{
"text": "when on a page to open the editor.",
"tag": "span"
}
]
},
"text": "Press e when on a page to open the editor."
},
@ -335,6 +469,12 @@
"text": "Browse space templates",
"url": "https://support.atlassian.com/confluence-cloud/docs/create-a-space-from-a-template"
}
],
"emphasized_texts": [
{
"text": "|",
"tag": "span"
}
]
},
"text": "Learn more about page templates | Browse space templates"
@ -365,7 +505,29 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 2
"page_number": 2,
"emphasized_texts": [
{
"text": "Press/to see a list of elements.",
"tag": "span"
},
{
"text": "Press",
"tag": "strong"
},
{
"text": "Press",
"tag": "span"
},
{
"text": "to see a list of elements.",
"tag": "strong"
},
{
"text": "to see a list of elements.",
"tag": "span"
}
]
},
"text": "Press/to see a list of elements."
},
@ -411,7 +573,29 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 2
"page_number": 2,
"emphasized_texts": [
{
"text": "Select Add status at the top of the page, so teammates can see your progress at a glance.",
"tag": "span"
},
{
"text": "Select",
"tag": "strong"
},
{
"text": "Select",
"tag": "span"
},
{
"text": "at the top of the page, so teammates can see your progress at a glance.",
"tag": "strong"
},
{
"text": "at the top of the page, so teammates can see your progress at a glance.",
"tag": "span"
}
]
},
"text": "Select Add status at the top of the page, so teammates can see your progress at a glance."
},
@ -453,7 +637,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 3
"page_number": 3,
"emphasized_texts": [
{
"text": "Collaborating with teammates",
"tag": "strong"
},
{
"text": "Collaborating with teammates",
"tag": "span"
}
]
},
"text": "Collaborating with teammates"
},
@ -473,7 +667,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 3
"page_number": 3,
"emphasized_texts": [
{
"text": "Edit",
"tag": "strong"
}
]
},
"text": "You and up to 11 teammates can edit a page together in real time. Changes save and sync automatically so that everyone editing sees the same thing. Start editing together by selecting the Edit icon in the header."
},
@ -483,7 +683,29 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 3
"page_number": 3,
"emphasized_texts": [
{
"text": "Press Command + Enter when in the editor to publish.",
"tag": "span"
},
{
"text": "Press",
"tag": "strong"
},
{
"text": "Press",
"tag": "span"
},
{
"text": "when in the editor to publish.",
"tag": "strong"
},
{
"text": "when in the editor to publish.",
"tag": "span"
}
]
},
"text": "Press Command + Enter when in the editor to publish."
},
@ -539,7 +761,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 3
"page_number": 3,
"emphasized_texts": [
{
"text": "Leave comments for your team in different places, depending on the type of feedback you have:",
"tag": "span"
}
]
},
"text": "Leave comments for your team in different places, depending on the type of feedback you have:"
},
@ -549,7 +777,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 3
"page_number": 3,
"emphasized_texts": [
{
"text": "Inline comment →Added to a highlighted section of the page or post you are editing or viewing; good for targeted comments on a specific word or phrase.",
"tag": "span"
},
{
"text": "Inline comment →",
"tag": "strong"
}
]
},
"text": "Inline comment →Added to a highlighted section of the page or post you are editing or viewing; good for targeted comments on a specific word or phrase."
},
@ -559,7 +797,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 3
"page_number": 3,
"emphasized_texts": [
{
"text": "Page comment →Added below the content of the page or post you are viewing; good for comments that apply to the content as a whole.",
"tag": "span"
},
{
"text": "Page comment →",
"tag": "strong"
}
]
},
"text": "Page comment →Added below the content of the page or post you are viewing; good for comments that apply to the content as a whole."
},
@ -569,7 +817,21 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 3
"page_number": 3,
"emphasized_texts": [
{
"text": "Leave an inline comment when viewing a page or in the editor.",
"tag": "span"
},
{
"text": "Leave an inline comment when viewing a page or in the editor.",
"tag": "strong"
},
{
"text": "Leave an inline comment when viewing a page or in the editor.",
"tag": "span"
}
]
},
"text": "Leave an inline comment when viewing a page or in the editor."
},
@ -611,7 +873,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 4
"page_number": 4,
"emphasized_texts": [
{
"text": "Organizing your content",
"tag": "strong"
},
{
"text": "Organizing your content",
"tag": "span"
}
]
},
"text": "Organizing your content"
},
@ -677,7 +949,29 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 4
"page_number": 4,
"emphasized_texts": [
{
"text": "Use the /info panel element to highlight important information in your space overview.",
"tag": "span"
},
{
"text": "Use the",
"tag": "strong"
},
{
"text": "Use the",
"tag": "span"
},
{
"text": "panel element to highlight important information in your space overview.",
"tag": "strong"
},
{
"text": "panel element to highlight important information in your space overview.",
"tag": "span"
}
]
},
"text": "Use the /info panel element to highlight important information in your space overview."
},
@ -713,7 +1007,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 4
"page_number": 4,
"emphasized_texts": [
{
"text": "BLOG POST",
"tag": "span"
}
]
},
"text": "BLOG POST"
},
@ -733,7 +1033,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 4
"page_number": 4,
"emphasized_texts": [
{
"text": "PAGE",
"tag": "span"
}
]
},
"text": "PAGE"
},
@ -753,7 +1059,29 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 4
"page_number": 4,
"emphasized_texts": [
{
"text": "Press k for a shareable link to your page or post.",
"tag": "span"
},
{
"text": "Press",
"tag": "strong"
},
{
"text": "Press",
"tag": "span"
},
{
"text": "for a shareable link to your page or post.",
"tag": "strong"
},
{
"text": "for a shareable link to your page or post.",
"tag": "span"
}
]
},
"text": "Press k for a shareable link to your page or post."
},
@ -799,7 +1127,29 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 4
"page_number": 4,
"emphasized_texts": [
{
"text": "Type /table to add a table at any time.",
"tag": "span"
},
{
"text": "Type",
"tag": "strong"
},
{
"text": "Type",
"tag": "span"
},
{
"text": "to add a table at any time.",
"tag": "strong"
},
{
"text": "to add a table at any time.",
"tag": "span"
}
]
},
"text": "Type /table to add a table at any time."
},
@ -845,7 +1195,21 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 4
"page_number": 4,
"emphasized_texts": [
{
"text": "DRAFT",
"tag": "span"
},
{
"text": "Only you can see your draft, unless you share a link to it.",
"tag": "strong"
},
{
"text": "Only you can see your draft, unless you share a link to it.",
"tag": "span"
}
]
},
"text": "DRAFTOnly you can see your draft, unless you share a link to it."
},
@ -887,7 +1251,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 5
"page_number": 5,
"emphasized_texts": [
{
"text": "Whats next?",
"tag": "strong"
},
{
"text": "Whats next?",
"tag": "span"
}
]
},
"text": "Whats next?"
},
@ -917,7 +1291,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 5
"page_number": 5,
"emphasized_texts": [
{
"text": "Explore a Space",
"tag": "span"
}
]
},
"text": "Explore a Space"
},
@ -927,7 +1307,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 5
"page_number": 5,
"emphasized_texts": [
{
"text": "Create a page",
"tag": "span"
}
]
},
"text": "Create a page"
},
@ -937,7 +1323,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 5
"page_number": 5,
"emphasized_texts": [
{
"text": "Invite a teammate",
"tag": "span"
}
]
},
"text": "Invite a teammate"
},

View File

@ -5,7 +5,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Say hello to your colleagues who want to know your name, pronouns, role, team and location (or if you're remote).",
"tag": "span"
}
]
},
"text": "Say hello to your colleagues who want to know your name, pronouns, role, team and location (or if you're remote)."
},
@ -45,6 +51,12 @@
"text": "ryan",
"url": "/wiki/display/~64083457896d10ebd4738661"
}
],
"emphasized_texts": [
{
"text": "•",
"tag": "span"
}
]
},
"text": "Overview\n \n \n Jun 30, 2023 • contributed by ryan"
@ -65,6 +77,12 @@
"text": "ryan",
"url": "/wiki/display/~64083457896d10ebd4738661"
}
],
"emphasized_texts": [
{
"text": "•",
"tag": "span"
}
]
},
"text": "Getting started in Confluence\n \n \n Jun 30, 2023 • contributed by ryan"
@ -85,6 +103,12 @@
"text": "ryan",
"url": "/wiki/display/~64083457896d10ebd4738661"
}
],
"emphasized_texts": [
{
"text": "•",
"tag": "span"
}
]
},
"text": "ryan\n \n \n Jun 30, 2023 • contributed by ryan"
@ -135,7 +159,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "End with a bang! Some options are:",
"tag": "span"
},
{
"text": "<Insert company name>",
"tag": "span"
}
]
},
"text": "End with a bang! Some options are: \"I am so grateful to be here at <Insert company name> and very excited to get started!\" or \"Looking forward to meeting all of you!\" or \"Can't wait to get to know all of you!\""
}

View File

@ -5,7 +5,13 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "Say hello to your colleagues who want to know your name, pronouns, role, team and location (or if you're remote).",
"tag": "span"
}
]
},
"text": "Say hello to your colleagues who want to know your name, pronouns, role, team and location (or if you're remote)."
},
@ -45,6 +51,12 @@
"text": "ahmet",
"url": "/wiki/display/~712020%3A5368eedf-cecd-43e1-8b25-b2221316ee6f"
}
],
"emphasized_texts": [
{
"text": "•",
"tag": "span"
}
]
},
"text": "Overview\n \n \n Jul 12, 2023 • contributed by ahmet"
@ -65,6 +77,12 @@
"text": "ahmet",
"url": "/wiki/display/~712020%3A5368eedf-cecd-43e1-8b25-b2221316ee6f"
}
],
"emphasized_texts": [
{
"text": "•",
"tag": "span"
}
]
},
"text": "ahmet\n \n \n Jul 12, 2023 • contributed by ahmet"
@ -115,7 +133,17 @@
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
"page_number": 1,
"emphasized_texts": [
{
"text": "End with a bang! Some options are:",
"tag": "span"
},
{
"text": "<Insert company name>",
"tag": "span"
}
]
},
"text": "End with a bang! Some options are: \"I am so grateful to be here at <Insert company name> and very excited to get started!\" or \"Looking forward to meeting all of you!\" or \"Can't wait to get to know all of you!\""
}

View File

@ -15,6 +15,12 @@
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -15,6 +15,12 @@
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -15,6 +15,12 @@
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -15,6 +15,12 @@
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -15,6 +15,12 @@
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -13,7 +13,13 @@
"sent_to": [
"devops@unstructuredio.onmicrosoft.com"
],
"subject": "subfolder1_1"
"subject": "subfolder1_1",
"emphasized_texts": [
{
"text": "this is a message for the subfolder1_1",
"tag": "span"
}
]
},
"text": "this is a message for the subfolder1_1"
}

View File

@ -146,6 +146,9 @@ class ElementMetadata:
# MSFT Word specific metadata fields
header_footer_type: Optional[str] = None
# Formatting metadata fields
emphasized_texts: Optional[List[dict]] = None
# Text format metadata fields
text_as_html: Optional[str] = None

View File

@ -48,6 +48,7 @@ class TagsMixin:
tag: Optional[str] = None,
ancestortags: Sequence[str] = (),
links: Sequence[Link] = [],
emphasized_texts: Sequence[dict] = [],
**kwargs,
):
if tag is None:
@ -56,6 +57,7 @@ class TagsMixin:
self.tag = tag
self.ancestortags = ancestortags
self.links = links
self.emphasized_texts = emphasized_texts
super().__init__(*args, **kwargs)
@ -132,7 +134,8 @@ class HTMLDocument(XMLDocument):
elif _is_container_with_text(tag_elem):
links = _get_links_from_tag(tag_elem)
element = _text_to_element(tag_elem.text, "div", (), links)
emphasized_texts = _get_emphasized_texts_from_tag(tag_elem)
element = _text_to_element(tag_elem.text, "div", (), links, emphasized_texts)
if element is not None:
page.elements.append(element)
@ -232,6 +235,25 @@ def _get_links_from_tag(tag_elem: etree.Element) -> List[Link]:
return links
def _get_emphasized_texts_from_tag(tag_elem: etree.Element) -> List[dict]:
    """Collect the text wrapped in emphasis-style tags within an HTML element.

    The tracked tags are <strong>, <em>, <span>, <b> and <i>. The element itself
    is checked first, followed by each of its tracked descendants in the order
    yielded by ``iterdescendants``. Each non-empty text produces one
    ``{"text": ..., "tag": ...}`` entry; text is built without the element's
    tail text. Returns an empty list when ``tag_elem`` is None.
    """
    if tag_elem is None:
        return []

    tracked = ("strong", "em", "span", "b", "i")

    # The element itself is a candidate only when it is a tracked tag;
    # iterdescendants() never yields the element it is called on.
    candidates = [tag_elem] if tag_elem.tag in tracked else []
    candidates.extend(tag_elem.iterdescendants(*tracked))

    found = []
    for node in candidates:
        # False -> exclude tail text, which lies outside the emphasized tag.
        content = _construct_text(node, False)
        if content:
            found.append({"text": content, "tag": node.tag})
    return found
def _parse_tag(
tag_elem: etree.Element,
) -> Optional[Element]:
@ -241,13 +263,20 @@ def _parse_tag(
but we don't have a use for them at the moment."""
ancestortags: Tuple[str, ...] = tuple(el.tag for el in tag_elem.iterancestors())[::-1]
links = _get_links_from_tag(tag_elem)
emphasized_texts = _get_emphasized_texts_from_tag(tag_elem)
if tag_elem.tag == "script":
return None
text = _construct_text(tag_elem)
if not text:
return None
return _text_to_element(text, tag_elem.tag, ancestortags, links=links)
return _text_to_element(
text,
tag_elem.tag,
ancestortags,
links=links,
emphasized_texts=emphasized_texts,
)
def _text_to_element(
@ -255,6 +284,7 @@ def _text_to_element(
tag: str,
ancestortags: Tuple[str, ...],
links: List[Link] = [],
emphasized_texts: List[dict] = [],
) -> Optional[Element]:
"""Given the text of an element, the tag type and the ancestor tags, produces the appropriate
HTML element."""
@ -266,18 +296,43 @@ def _text_to_element(
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
)
elif is_us_city_state_zip(text):
return HTMLAddress(text=text, tag=tag, ancestortags=ancestortags, links=links)
return HTMLAddress(
text=text,
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
)
if len(text) < 2:
return None
elif is_narrative_tag(text, tag):
return HTMLNarrativeText(text, tag=tag, ancestortags=ancestortags, links=links)
return HTMLNarrativeText(
text,
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
)
elif is_possible_title(text):
return HTMLTitle(text, tag=tag, ancestortags=ancestortags, links=links)
return HTMLTitle(
text,
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
)
else:
return HTMLText(text, tag=tag, ancestortags=ancestortags, links=links)
return HTMLText(
text,
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
)
def _is_container_with_text(tag_elem: etree.Element) -> bool:
@ -303,14 +358,14 @@ def is_narrative_tag(text: str, tag: str) -> bool:
return tag not in HEADING_TAGS and is_possible_narrative_text(text)
def _construct_text(tag_elem: etree.Element) -> str:
def _construct_text(tag_elem: etree.Element, include_tail_text: bool = True) -> str:
"""Extracts text from a text tag element."""
text = ""
for item in tag_elem.itertext():
if item:
text += item
if tag_elem.tail:
if include_tail_text and tag_elem.tail:
text = text + tag_elem.tail
text = replace_unicode_quotes(text)
@ -351,7 +406,16 @@ def _process_list_item(
if tag_elem.tag in LIST_ITEM_TAGS:
text = _construct_text(tag_elem)
links = _get_links_from_tag(tag_elem)
return HTMLListItem(text=text, tag=tag_elem.tag, links=links), tag_elem
emphasized_texts = _get_emphasized_texts_from_tag(tag_elem)
return (
HTMLListItem(
text=text,
tag=tag_elem.tag,
links=links,
emphasized_texts=emphasized_texts,
),
tag_elem,
)
elif tag_elem.tag == "div":
text = _construct_text(tag_elem)

View File

@ -89,7 +89,7 @@ class XMLDocument(Document):
text_content.append(element.text)
for text in text_content:
element = etree.Element("span")
element = etree.Element("p")
element.text = str(element_from_text(text=text))
document_tree.append(element)

View File

@ -162,6 +162,11 @@ def _add_element_metadata(
else None
)
links = element.links if hasattr(element, "links") and len(element.links) > 0 else None
emphasized_texts = (
element.emphasized_texts
if hasattr(element, "emphasized_texts") and len(element.emphasized_texts) > 0
else None
)
metadata = ElementMetadata(
coordinates=coordinates_metadata,
filename=filename,
@ -170,6 +175,7 @@ def _add_element_metadata(
url=url,
text_as_html=text_as_html,
links=links,
emphasized_texts=emphasized_texts,
)
element.metadata = metadata.merge(element.metadata)
return element

View File

@ -8,7 +8,13 @@ class Properties(TypedDict):
dataType: List[str]
exclude_metadata_keys = ("data_source", "coordinates", "links", "regex_metadata")
exclude_metadata_keys = (
"data_source",
"coordinates",
"links",
"regex_metadata",
"emphasized_texts",
)
def stage_for_weaviate(elements: List[Text]) -> List[Dict[str, Any]]: