unstructured/examples/custom-layout-order/visualise_and_reorder.ipynb

1071 lines
4.9 MiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "markdown",
"id": "8c7a952c-1b24-4e6f-843c-bd01cf02ea37",
"metadata": {},
"source": [
"The purpose of this notebook is to provide two helper methods (plot and reorder) that can be customised to fit specific document layouts. This type of solution only applies to images of text (JPEG or PNG)."
]
},
{
"cell_type": "markdown",
"id": "6c621f72-9e15-41c5-98c0-d999da295b90",
"metadata": {},
"source": [
"### Dependencies and Methods"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f49030e9-81e4-4fae-99ec-19be2ee8e610",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"from unstructured_inference.inference.layout import DocumentLayout\n",
"from PIL import Image\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.patches as patches"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "47fb2b0b-7d86-443b-9fe3-e993d38aa24b",
"metadata": {},
"outputs": [],
"source": [
"def plot_image_with_bounding_boxes_coloured(image_path, bounding_boxes, text_labels=None, desired_width=None):\n",
"    \"\"\"Show an image with its bounding boxes coloured by page column.\n",
"\n",
"    The image is split by a vertical line at half its width. A box lying\n",
"    entirely left of the line is drawn in red, a box entirely right of it in\n",
"    blue, and any other box (spanning or touching the line) in green.\n",
"\n",
"    Args:\n",
"        image_path: Path of the image file to display.\n",
"        bounding_boxes: Iterable of boxes. Each box is a sequence of (x, y)\n",
"            points in which box[0] is the (x_min, y_min) corner and box[2]\n",
"            is the (x_max, y_max) corner.\n",
"        text_labels: Optional iterable of labels, paired with the boxes in\n",
"            order and drawn just above each box. When None, the boxes are\n",
"            drawn without labels.\n",
"        desired_width: Optional figure width handed to matplotlib's figsize;\n",
"            the height is derived from the image's aspect ratio. When falsy,\n",
"            matplotlib's default figure size is used.\n",
"    \"\"\"\n",
"    # zip() cannot iterate over None, so substitute one placeholder per box\n",
"    # and remember that no label text must be drawn.\n",
"    draw_labels = text_labels is not None\n",
"    if not draw_labels:\n",
"        bounding_boxes = list(bounding_boxes)\n",
"        text_labels = [None] * len(bounding_boxes)\n",
"\n",
"    # Load the image\n",
"    image = Image.open(image_path)\n",
"\n",
"    if desired_width:\n",
"        # Calculate the desired height based on the original aspect ratio\n",
"        aspect_ratio = image.width / image.height\n",
"        desired_height = desired_width / aspect_ratio\n",
"\n",
"        # Create a figure with the desired size and aspect ratio\n",
"        fig, ax = plt.subplots(figsize=(desired_width, desired_height))\n",
"    else:\n",
"        # Create figure and axes with matplotlib's defaults\n",
"        fig, ax = plt.subplots()\n",
"\n",
"    # Display the image\n",
"    ax.imshow(image)\n",
"\n",
"    # Split the image vertically down the middle\n",
"    vertical_line_x = image.width // 2\n",
"\n",
"    # (edge colour, translucent face colour) per position relative to the split\n",
"    styles = {\n",
"        'left': ('red', (1.0, 0.0, 0.0, 0.05)),\n",
"        'right': ('blue', (0.0, 0.0, 1.0, 0.05)),\n",
"        'both': ('green', (0.0, 1.0, 0.0, 0.05)),\n",
"    }\n",
"\n",
"    # Add bounding boxes and custom text labels to the plot\n",
"    for bbox, label in zip(bounding_boxes, text_labels):\n",
"        # Extract the two opposite corners\n",
"        x_min, y_min = bbox[0]\n",
"        x_max, y_max = bbox[2]\n",
"\n",
"        # Determine the side based on the position of the bounding box\n",
"        if x_min < vertical_line_x and x_max < vertical_line_x:\n",
"            side = 'left'\n",
"        elif x_min > vertical_line_x and x_max > vertical_line_x:\n",
"            side = 'right'\n",
"        else:\n",
"            # Spanning (or touching) the split line\n",
"            side = 'both'\n",
"        edge_colour, face_colour = styles[side]\n",
"\n",
"        # Create the rectangle patch and add it to the plot\n",
"        rect = patches.Rectangle((x_min, y_min), x_max - x_min, y_max - y_min, linewidth=1, edgecolor=edge_colour, facecolor=face_colour)\n",
"        ax.add_patch(rect)\n",
"\n",
"        if draw_labels:\n",
"            # Add custom text label above the bounding box, in the box's colour\n",
"            ax.text(x_min, y_min - 5, label, fontsize=12, weight='bold', color=edge_colour, bbox=dict(facecolor=(1.0, 1.0, 1.0, 0.8), edgecolor=(0.95, 0.95, 0.95, 0.0), pad=0.5))\n",
"\n",
"    # Draw the vertical line to split the image\n",
"    ax.axvline(x=vertical_line_x, color='black', linestyle='--', linewidth=1)\n",
"\n",
"    # Show the plot\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "fc839a8d-d604-411a-a5e0-0ab849daf2f7",
"metadata": {},
"outputs": [],
"source": [
"def reorder_elements_in_double_columns(image_path, bounding_boxes):\n",
"    \"\"\"Return bounding-box indices reordered for a two-column page.\n",
"\n",
"    The page is split by a vertical line at half the image width. Indices of\n",
"    boxes that span (or touch) the line come first, followed by boxes lying\n",
"    entirely in the left half, then boxes lying entirely in the right half.\n",
"    Within each group the incoming order is preserved.\n",
"\n",
"    Args:\n",
"        image_path: Path of the page image; only its width is read.\n",
"        bounding_boxes: Iterable of boxes. Each box is a sequence of (x, y)\n",
"            points; x_min is taken from the first point and x_max from the\n",
"            last point.\n",
"\n",
"    Returns:\n",
"        A list of indices into bounding_boxes in the new reading order.\n",
"    \"\"\"\n",
"    # TODO: the original note here was left unfinished ('if first element of left and').\n",
"    # NOTE(review): spanning boxes are always emitted first, whatever their\n",
"    # vertical position on the page - confirm this suits the target layout.\n",
"\n",
"    # Image.open is lazy and only the width is needed, so close the file\n",
"    # as soon as the size has been read.\n",
"    with Image.open(image_path) as image:\n",
"        # Split the image vertically down the middle\n",
"        vertical_line_x = image.width // 2\n",
"\n",
"    # Bucket the box indices by their position relative to the split line\n",
"    left_boxes = []\n",
"    right_boxes = []\n",
"    both_sided_boxes = []\n",
"    for i, bbox in enumerate(bounding_boxes):\n",
"        x_min = bbox[0][0]\n",
"        x_max = bbox[-1][0]\n",
"        if x_min < vertical_line_x and x_max < vertical_line_x:\n",
"            left_boxes.append(i)\n",
"        elif x_min > vertical_line_x and x_max > vertical_line_x:\n",
"            right_boxes.append(i)\n",
"        else:\n",
"            both_sided_boxes.append(i)\n",
"\n",
"    # New order: spanning boxes, then the left column, then the right column\n",
"    return both_sided_boxes + left_boxes + right_boxes"
]
},
{
"cell_type": "markdown",
"id": "f4d98f67-cb88-4d82-8fb9-8c4c735497d1",
"metadata": {},
"source": [
"### Federal Regulation Documents"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2e4de53c-ef60-4dfa-93b9-4c99045eef87",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Width: 1248\n",
"Height: 1664\n"
]
}
],
"source": [
"# Page image to analyse, relative to this notebook\n",
"image_path = \"../../example-docs/double-column-A.jpg\"\n",
"image = Image.open(image_path)\n",
"# Detect the layout from the image file (from_file for pdfs)\n",
"layout = DocumentLayout.from_image_file(image_path)\n",
"# Report the page size in pixels\n",
"width = image.width\n",
"height = image.height\n",
"print(\"Width:\", width)\n",
"print(\"Height:\", height)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "260bb85b-c753-43d4-a60b-906d8602781c",
"metadata": {},
"outputs": [],
"source": [
"elements = layout.pages[0].elements"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "820d55b2-a12e-4eaa-ae0b-ead399e37371",
"metadata": {},
"outputs": [],
"source": [
"# Corner points of every detected element, in detection order\n",
"elements_coordinates = [element.to_dict()['coordinates'] for element in elements]\n",
"# Plot labels of the form '<1-based position>: <element type>'\n",
"elements_types = [\n",
"    f\"{position}: {element.to_dict()['type']}\"\n",
"    for position, element in enumerate(elements, start=1)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3176c13e-e15c-4dbc-9890-8ed6dad8811a",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABG0AAAXMCAYAAACbWXE4AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOz9d5xW9Z3//z/e55yrzDWFoffemxgYEcUKNhTRZGPKmjXZJGKKKbv+skn2u4m4e8sm2U+y6pZEMXFjTNSYGCNgx4KAIF2kg/SZoQ1Mn6udc35/XHOO14WIWK+JPO+5EeAq57yuKu/nvN+vt/F9HxERERERERER6VisYhcgIiIiIiIiIiJvptBGRERERERERKQDUmgjIiIiIiIiItIBKbQREREREREREemAFNqIiIiIiIiIiHRACm1ERERERERERDqgDz20McZcYYzZaozZYYz53od9fhERERERERGRvwbG9/0P72TG2MA24FJgP7AS+Kzv+5s+tCJERERERERERP4KfNgzbSYDO3zf3+n7fhp4CLjmQ65BRERERERERKTDcz7k8/UF9uX9fT9w9lvduFu3bv6gQYM+6JpE3iP/FP58ostM3i/yfn+r24qIyIehpqaGPn36FLsMEREROY2sXr36iO/73Y+//MMObd6WMWY2MBtgwIABrFq1qsgViZyIzxuhip/3y4DvneQ+3hu3wwJs3hTemDeO64fnMBgsFN6IiHzwjDHU1NQUuwwRERE5jRhj9pzo8g87tKkG+uf9vV/7ZSHf9+cCcwGqqqo+vIY7Iu/ICWbM+HnBy9tNuMmXP9mm4E4m784Ka0RERERERE43H3ZPm5XAcGPMYGNMFPgMMO9DrkHkfWAKf/knmAWTPxnnrS4z+Vd4YIKZOMHxbApn44iIyAftxhtvLHYJIiIiIsCHPNPG9/2sMeZm4GlyI9F7fd/f+GHWIPL+OIUA5UQ3MXlzZ0yQ4LjtS6KCa+zcb75VeAhlNiIiH4q5c+cWuwQRERERoAg9bXzffwJ44sM+r8j76eQrnvJnzLi80ccGwMbDwsLDN8F1Wd6YXWMBEQyR3J+PP5GCGxGRD9ykSZNYvXp1scsQERER6XiNiOWjw/dPHG0Y8+6Sh+B4p3L/48/9dvc5/tincq7Cc5gT/tkAGB+/vTmxbxx83+ACxvfBeHm39sH3MWH74fbj+waM8hoRkQ/LmjVril2CiIiICKDQRj5g7zQ8ebv7v9vbv10d7yQQCm6fSmWwLBvHsQAf3zV4PviewbJtbBs838ktifLB9Xw8H7JZg+c7xGIxLBsMLgYP38uSyWbAyxKPRUilUhjfIhaNgKXIRkRERERE5HSj0EY+EEEIYlnWmy7PD1De7ayb48/zTm6Tf878sCb/8pPX5ZPNpolEbYyxaGlJ42YA3yWTzv3yPEinXRobMriuSzbrU9+Qor4hg+dHiMQM3bo4DBjYiZ69OhGJQlNjkspOZdgRCx+fWKy9AY7xACuYtnPKz42IiLw7vXv3LnYJIiIiIoBCG3kbbxeKnPryocLL3m1Y81b3O1kQdHwg81b1nKjeE93PByIRm5ZkG9FoKdFYhJ37DrNt6z62bNrL4YNNJFs9GhvbaGpqI5XM4GY9MA5Zt4Ssa5POtBGJeIwc2Z9p08czefJA+vStwM3kutxEYuC6LpgsxgfLioLRx1VE5MNQU1NT7BJEREREAIU2UiSnGtq83e2CsOVEt8vvT+N53psuC34/0cybtz+3hcHB9wxuFiorE/Tq2Y01K19nxStrwI/Ro3sf+vcdRM9e3elc6RCL2diRruzd6/Hq+g3sr97FurXb2L+vmhef78LlMyZx6SVjKEnYpNMQi0XAeLhuBh9Pc2xERD4kc+bMYc6cOcUuQ0REREShjXxwgtAjPyh5r8uhADzPKzj2CWfD5C3DOtHsnvyw59SXRYW3IusaYtESfB8iDsRjNps2bWfPnp2cOXEcU86ewoQzel
BaWkZ5eQm2Y7F/fzPPvfAqu/fXcPjYLjJeE5EI1Dce4ci6PWSyjQwf3p9xYysxgOf6uL6H49hoWZSIyIfntttuU2gjIiIiHYJCG3lbJ1si9VYhR3C553m5ZT7k+tu8VY+b4Fd+iHKi0Cc4nmVZ2LZdUF/+OfOPGZz3+Bk1wfk8zyOTyWCMIRKJYFnWKSwLI9zhqbo6xaOPLmDlyrXMmHEJl0w/m06VCWzHkEx5lJVZHD1Wz5otz1N9dB8DRvfkjLPOpKSknJ7demLbDnt27WL3zu08/8JSmhtHMrFqOLG4hW1igIcx1knrERERERERkY8e80535/kwVVVV+atWrSp2GR8Zb9VjxnVdHMcJ/57JZIhGowBks1kymQyWZeF5XsEyI8dxcBwHz/MK7p/NZnEch0wmQyaTCWfGRCIRAGzbDm+fzWbDUCebzQJvNC92HAdjDK7rEolE8DyPVCoVBjGxWAzHccIAJpvNhuFMUHd+vZFIJAx+gmAnGo2SyWRoa2vDsixKSkqIRqPYtn3SWTeZrMED6o9m+J//uZdtO7bw9a/9PZOqxmI7Fk4Yh/p4fhKMi+tm8fwoGT+CjY1jGXwfsilDxPJJtnns2nGMhx76HRUVJXzmM59g4OBuOBGDsWjf9lszbkREPmhv1edMRERE5INijFnt+37Vmy7vyP8oUWjz/snv65JOp4nFYmSzWdasWcPzzz/PV77yFUpLS9m7dy/f//73uf/++4lEIqxYsYLXXnuNc889l+XLl7Nhwwaam5sZNmwYEyZMwBjDa6+9xuc+9zl69+7N9u3b+eUvf8lXv/pVnn76afbu3UuvXr3o3LkzM2fO5MEHHySVSvHFL36Rzp0789hjj1FfX8+FF17I/Pnzqa2tJZvNUlpayqc+9Smampp49NFH+eEPf8jq1at54okn6Nu3Ly0tLYwfP54rrrgCy7I4cuQId911F5deeimTJk1i+/btPP300+zatQvf9+nevTt/+7d/y+rVq1m4cCFjxoyhS5cuXHrppfz2t7+jqamZ8vIEra0tdO5cyac//Wl69+6N74Pngef5WMaQyeQCKmMZDhxOc++9f+L1XZv4/33nq4wa2Rff87EMJFNpSuIxDD6+77W/Bh6+nQbfBj+O73tkMm2URMsgC3iQSvkY2+Xe3zzNS4tW8ZnrZ3LRtDOoKItgmdwMHxER+WCtXr2aSZMmFbsMEREROY28VWij5VGniWA2im3bxGIxjDE0NDTw8ssv09LSwrPPPsvVV1+N7/uMHz8eyIU7iUSCRCJB7969+dKXvsTixYtZtWoV3/rWt8hkMixZsgTHcWhububo0aPhrJ1u3bpRXl7OtGnTGDZsGF27diUSiVBZWcmiRYv485//zHXXXUefPn1IJBIMHDiQb3zjGzz55JOUl5dz1llnEY1GefXVV+nfvz/79+/nscce45Of/CSTJ0+mrq6OY8eO0djYSDweZ8eOHVRWVvLAAw/Qq1cvhg4dype//GUWLFhAaWkp06ZNI5FIsGvXLgYOHMjHP/5xHMehe7fudKqoZMyYMcyceSWNjQ3MnXs3f/rTn/jqV7/avgTLEInYNNS3UNmpnGTSp6UtwxOPL6em9iA/+OH36d69JPdTWeNiDMRjBgufdCaFbWxs2wJjkc3GMZZFOtW+bMuPcLD2GNmkhe86JEqipPwss66+kPVrD/OLX/yJRHmMiy8Yi+UosRERERERETmdKLQ5TQRTvT3PC5cvzZ8/n65duzJixAiWLl3K6NGjKS8vp7GxEc/zSCQSAMRiMaLRaNj7BXIhUCwWw7ZtDhw4wP79+zl69Ch1dXWUlJRgjCGTybB161YOHjzIlClT6NOnD7179+aGG27ghRdeYN68eQwYMADXdXFdN1yyFPSg8TwvDJgOHjyI4zj07t0by7Lo1q0bnTt3xnEcdu7cyaZNmxg2bBg1NTUsWLCAr33ta+FSKsdxwqVRiUSC/fv3s27dOvr370/v3r2JRqOUlJ
Tgui6lpWVMnjyZZcuW0draSllZGY7jkM26xONRsq6P78PqVVupqa7h5q/9Hb17lGI7Pvge4OL5Lo5lYchgGRfLNoA
"text/plain": [
"<Figure size 1440x1920 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plot_image_with_bounding_boxes_coloured(image_path, elements_coordinates, elements_types, desired_width=20)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "3f4ec817-0521-44e7-a5b0-f9f28277c338",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0, 2, 3, 4, 6, 7, 11, 13, 1, 5, 8, 9, 10, 12]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_ixs = reorder_elements_in_double_columns(image_path, elements_coordinates)\n",
"new_ixs"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "17c4c481-b950-4be4-979b-06ce56595eba",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"🔥 element index: 0\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 387.199541015625,\n",
" 369.36095448369565\n",
" ],\n",
" [\n",
" 387.199541015625,\n",
" 402.3356487771739\n",
" ],\n",
" [\n",
" 855.3382690429688,\n",
" 402.3356487771739\n",
" ],\n",
" [\n",
" 855.3382690429688,\n",
" 369.36095448369565\n",
" ]\n",
" ],\n",
" \"text\": \"SUBCHAPTER A\\u2014CIVIL SERVICE RULES\\n\",\n",
" \"type\": \"Title\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 387.199541015625,\n",
" 369.36095448369565\n",
" ],\n",
" [\n",
" 387.199541015625,\n",
" 402.3356487771739\n",
" ],\n",
" [\n",
" 855.3382690429688,\n",
" 402.3356487771739\n",
" ],\n",
" [\n",
" 855.3382690429688,\n",
" 369.36095448369565\n",
" ]\n",
" ],\n",
" \"text\": \"SUBCHAPTER A\\u2014CIVIL SERVICE RULES\\n\",\n",
" \"type\": \"Title\"\n",
"}\n",
"\n",
"🔥 element index: 1\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 634.0365966796875,\n",
" 438.10726147342996\n",
" ],\n",
" [\n",
" 634.0365966796875,\n",
" 777.8485431763285\n",
" ],\n",
" [\n",
" 980.589873046875,\n",
" 777.8485431763285\n",
" ],\n",
" [\n",
" 980.589873046875,\n",
" 438.10726147342996\n",
" ]\n",
" ],\n",
" \"text\": \"(c) Competitive status shall mean basic\\neligibility to be noncompetitively se-\\nlected to fill a vacancy in a competi-\\ntive position. A competitive status\\nshall be acquired by career-conditional\\nor career appointment through open\\ncompetitive examination upon satis-\\nfactory completion of a probationary\\nperiod, or may be granted by statute,\\nexecutive order, or the Civil Service\\nRules without competitive examina-\\ntion. A person with competitive status\\nmay be promoted, transferred, reas-\\nsigned, reinstated, or demoted without\\ntaking an open competitive examina-\\ntion, subject to the conditions pre-\\nscribed by the Civil Service Rules and\\nRegulations.\\n\\n(a) An employee shall be considered\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 262.8152014160156,\n",
" 501.94208937198067\n",
" ],\n",
" [\n",
" 262.8152014160156,\n",
" 681.1295440821256\n",
" ],\n",
" [\n",
" 612.7875476074219,\n",
" 681.1295440821256\n",
" ],\n",
" [\n",
" 612.7875476074219,\n",
" 501.94208937198067\n",
" ]\n",
" ],\n",
" \"text\": \"Sec.\\n\\n1.1 Positions and employees affected by the\\nrules in this subchapter.\\n\\n1.2 Extent of the competitive service.\\n\\n1.3 Definitions.\\n\\n1.4 Extent of the excepted service.\\n\\nU.S.C. 3301, 3302.\\n\\nSOURCE: 28 FR 10022, Sept. 14, 1963, unless\\notherwise noted.\\n\\nAUTHORITY:\\n\\n\",\n",
" \"type\": \"List\"\n",
"}\n",
"\n",
"🔥 element index: 2\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 262.8152014160156,\n",
" 501.94208937198067\n",
" ],\n",
" [\n",
" 262.8152014160156,\n",
" 681.1295440821256\n",
" ],\n",
" [\n",
" 612.7875476074219,\n",
" 681.1295440821256\n",
" ],\n",
" [\n",
" 612.7875476074219,\n",
" 501.94208937198067\n",
" ]\n",
" ],\n",
" \"text\": \"Sec.\\n\\n1.1 Positions and employees affected by the\\nrules in this subchapter.\\n\\n1.2 Extent of the competitive service.\\n\\n1.3 Definitions.\\n\\n1.4 Extent of the excepted service.\\n\\nU.S.C. 3301, 3302.\\n\\nSOURCE: 28 FR 10022, Sept. 14, 1963, unless\\notherwise noted.\\n\\nAUTHORITY:\\n\\n\",\n",
" \"type\": \"List\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 271.1450244140625,\n",
" 693.3728298611111\n",
" ],\n",
" [\n",
" 271.1450244140625,\n",
" 731.4510152475846\n",
" ],\n",
" [\n",
" 613.0182055664063,\n",
" 731.4510152475846\n",
" ],\n",
" [\n",
" 613.0182055664063,\n",
" 693.3728298611111\n",
" ]\n",
" ],\n",
" \"text\": \"1.1 Positions and employees affected\\nby the rules in this subchapter.\\n\\n\",\n",
" \"type\": \"Title\"\n",
"}\n",
"\n",
"🔥 element index: 3\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 271.1450244140625,\n",
" 693.3728298611111\n",
" ],\n",
" [\n",
" 271.1450244140625,\n",
" 731.4510152475846\n",
" ],\n",
" [\n",
" 613.0182055664063,\n",
" 731.4510152475846\n",
" ],\n",
" [\n",
" 613.0182055664063,\n",
" 693.3728298611111\n",
" ]\n",
" ],\n",
" \"text\": \"1.1 Positions and employees affected\\nby the rules in this subchapter.\\n\\n\",\n",
" \"type\": \"Title\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 268.2219287109375,\n",
" 735.2503358997584\n",
" ],\n",
" [\n",
" 268.2219287109375,\n",
" 887.1597222222222\n",
" ],\n",
" [\n",
" 613.424296875,\n",
" 887.1597222222222\n",
" ],\n",
" [\n",
" 613.424296875,\n",
" 735.2503358997584\n",
" ]\n",
" ],\n",
" \"text\": \"il\\n\\nThe rules in this subchapter shall\\napply to all positions in the competi-\\ntive service and to all incumbents of\\nsuch positions. Except as expressly pro-\\nvided in the rule concerned, the rules\\nin this subchapter shall not apply to\\npositions and employees in the ex-\\ncepted service.\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥 element index: 4\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 268.2219287109375,\n",
" 735.2503358997584\n",
" ],\n",
" [\n",
" 268.2219287109375,\n",
" 887.1597222222222\n",
" ],\n",
" [\n",
" 613.424296875,\n",
" 887.1597222222222\n",
" ],\n",
" [\n",
" 613.424296875,\n",
" 735.2503358997584\n",
" ]\n",
" ],\n",
" \"text\": \"il\\n\\nThe rules in this subchapter shall\\napply to all positions in the competi-\\ntive service and to all incumbents of\\nsuch positions. Except as expressly pro-\\nvided in the rule concerned, the rules\\nin this subchapter shall not apply to\\npositions and employees in the ex-\\ncepted service.\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 271.1893707275391,\n",
" 905.4283137077294\n",
" ],\n",
" [\n",
" 271.1893707275391,\n",
" 925.8976222826086\n",
" ],\n",
" [\n",
" 609.5326757812501,\n",
" 925.8976222826086\n",
" ],\n",
" [\n",
" 609.5326757812501,\n",
" 905.4283137077294\n",
" ]\n",
" ],\n",
" \"text\": \"12 Extent of the competitive service.\\n\\n\",\n",
" \"type\": \"Title\"\n",
"}\n",
"\n",
"🔥 element index: 5\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 633.5991796875,\n",
" 775.5539175724638\n",
" ],\n",
" [\n",
" 633.5991796875,\n",
" 995.8069670893719\n",
" ],\n",
" [\n",
" 978.7497509765625,\n",
" 995.8069670893719\n",
" ],\n",
" [\n",
" 978.7497509765625,\n",
" 775.5539175724638\n",
" ]\n",
" ],\n",
" \"text\": \"(d) An employee shall be considered\\nas being in the competitive service\\nwhen he has a competitive status and\\noccupies a competitive position unless\\nhe is serving under a temporary ap-\\npointment: Provided, that an employee\\nwho is in the competitive service at the\\ntime his position is first listed under\\nSchedule A, B, or C shall be considered\\nas continuing in the competitive serv-\\nice as long as he continues to occupy\\nsuch position.\\n\\nfe) Tenure shall mean the neriad af\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 264.66160766601564,\n",
" 919.8008303140097\n",
" ],\n",
" [\n",
" 264.66160766601564,\n",
" 1255.5389794685989\n",
" ],\n",
" [\n",
" 616.7077331542969,\n",
" 1255.5389794685989\n",
" ],\n",
" [\n",
" 616.7077331542969,\n",
" 919.8008303140097\n",
" ]\n",
" ],\n",
" \"text\": \"31.2 Extent of the competitive service.\\n\\nThe competitive service shall in-\\nclude: (a) All civilian positions in the\\nexecutive branch of the Government\\nunless specifically excepted therefrom\\nby or pursuant to statute or by the Of-\\nfice of Personnel Management (here-\\nafter referred to in this subchapter as\\nOPM) under \\u00a76.1 of this subchapter;\\nand (b) all positions in the legislative\\nand judicial branches of the Federal\\nGovernment and in the Government of\\nthe District of Columbia which are spe-\\ncifically made subject to the civil serv-\\nice laws by statute. OPM is authorized\\nand directed to determine finally\\nwhether a position is in the competi-\\ntive service.\\n\",\n",
" \"type\": \"List\"\n",
"}\n",
"\n",
"🔥 element index: 6\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 271.1893707275391,\n",
" 905.4283137077294\n",
" ],\n",
" [\n",
" 271.1893707275391,\n",
" 925.8976222826086\n",
" ],\n",
" [\n",
" 609.5326757812501,\n",
" 925.8976222826086\n",
" ],\n",
" [\n",
" 609.5326757812501,\n",
" 905.4283137077294\n",
" ]\n",
" ],\n",
" \"text\": \"12 Extent of the competitive service.\\n\\n\",\n",
" \"type\": \"Title\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 272.54268310546877,\n",
" 1335.6573671497583\n",
" ],\n",
" [\n",
" 272.54268310546877,\n",
" 1447.6836503623188\n",
" ],\n",
" [\n",
" 611.55046875,\n",
" 1447.6836503623188\n",
" ],\n",
" [\n",
" 611.55046875,\n",
" 1335.6573671497583\n",
" ]\n",
" ],\n",
" \"text\": \"Reema tar cian\\n\\n(a) Competitive service shall have the\\nsame meaning as the words \\u201cclassified\\nservice\\u2019, or \\u201cclassified (competitive)\\nservice\\u2019, or \\u201cclassified civil service\\u201d as\\ndefined in existing statutes and execu-\\ntive orders.\\n\\npp aioe\\n\\nBier: ee ce\\n\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥 element index: 7\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 264.66160766601564,\n",
" 919.8008303140097\n",
" ],\n",
" [\n",
" 264.66160766601564,\n",
" 1255.5389794685989\n",
" ],\n",
" [\n",
" 616.7077331542969,\n",
" 1255.5389794685989\n",
" ],\n",
" [\n",
" 616.7077331542969,\n",
" 919.8008303140097\n",
" ]\n",
" ],\n",
" \"text\": \"31.2 Extent of the competitive service.\\n\\nThe competitive service shall in-\\nclude: (a) All civilian positions in the\\nexecutive branch of the Government\\nunless specifically excepted therefrom\\nby or pursuant to statute or by the Of-\\nfice of Personnel Management (here-\\nafter referred to in this subchapter as\\nOPM) under \\u00a76.1 of this subchapter;\\nand (b) all positions in the legislative\\nand judicial branches of the Federal\\nGovernment and in the Government of\\nthe District of Columbia which are spe-\\ncifically made subject to the civil serv-\\nice laws by statute. OPM is authorized\\nand directed to determine finally\\nwhether a position is in the competi-\\ntive service.\\n\",\n",
" \"type\": \"List\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 264.3911499023438,\n",
" 1448.8040760869565\n",
" ],\n",
" [\n",
" 264.3911499023438,\n",
" 1488.7708333333333\n",
" ],\n",
" [\n",
" 617.1382470703126,\n",
" 1488.7708333333333\n",
" ],\n",
" [\n",
" 617.1382470703126,\n",
" 1448.8040760869565\n",
" ]\n",
" ],\n",
" \"text\": \"ge ee ee\\n(b) Competitive position shall mean a\\nposition in the competitive service.\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥 element index: 8\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 632.1751940917969,\n",
" 997.1144248188406\n",
" ],\n",
" [\n",
" 632.1751940917969,\n",
" 1180.829898852657\n",
" ],\n",
" [\n",
" 983.0160424804687,\n",
" 1180.829898852657\n",
" ],\n",
" [\n",
" 983.0160424804687,\n",
" 997.1144248188406\n",
" ]\n",
" ],\n",
" \"text\": \"ee woe\\n\\n(e) Tenure shall mean the period of\\ntime an employee may reasonably ex-\\npect to serve under his current ap-\\npointment. Tenure shall be granted and\\ngoverned by the type of appointment\\nunder which an employee is currently\\nserving without regard to whether he\\nhas a competitive status or whether his\\nappointment is to a competitive posi-\\ntion or an excepted position.\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 634.0365966796875,\n",
" 438.10726147342996\n",
" ],\n",
" [\n",
" 634.0365966796875,\n",
" 777.8485431763285\n",
" ],\n",
" [\n",
" 980.589873046875,\n",
" 777.8485431763285\n",
" ],\n",
" [\n",
" 980.589873046875,\n",
" 438.10726147342996\n",
" ]\n",
" ],\n",
" \"text\": \"(c) Competitive status shall mean basic\\neligibility to be noncompetitively se-\\nlected to fill a vacancy in a competi-\\ntive position. A competitive status\\nshall be acquired by career-conditional\\nor career appointment through open\\ncompetitive examination upon satis-\\nfactory completion of a probationary\\nperiod, or may be granted by statute,\\nexecutive order, or the Civil Service\\nRules without competitive examina-\\ntion. A person with competitive status\\nmay be promoted, transferred, reas-\\nsigned, reinstated, or demoted without\\ntaking an open competitive examina-\\ntion, subject to the conditions pre-\\nscribed by the Civil Service Rules and\\nRegulations.\\n\\n(a) An employee shall be considered\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥 element index: 9\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 635.4368737792969,\n",
" 1198.3715579710145\n",
" ],\n",
" [\n",
" 635.4368737792969,\n",
" 1218.726645531401\n",
" ],\n",
" [\n",
" 953.8425000000001,\n",
" 1218.726645531401\n",
" ],\n",
" [\n",
" 953.8425000000001,\n",
" 1198.3715579710145\n",
" ]\n",
" ],\n",
" \"text\": \"\\u00a71.4 Extent of the excepted service.\\n\\n\",\n",
" \"type\": \"Title\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 633.5991796875,\n",
" 775.5539175724638\n",
" ],\n",
" [\n",
" 633.5991796875,\n",
" 995.8069670893719\n",
" ],\n",
" [\n",
" 978.7497509765625,\n",
" 995.8069670893719\n",
" ],\n",
" [\n",
" 978.7497509765625,\n",
" 775.5539175724638\n",
" ]\n",
" ],\n",
" \"text\": \"(d) An employee shall be considered\\nas being in the competitive service\\nwhen he has a competitive status and\\noccupies a competitive position unless\\nhe is serving under a temporary ap-\\npointment: Provided, that an employee\\nwho is in the competitive service at the\\ntime his position is first listed under\\nSchedule A, B, or C shall be considered\\nas continuing in the competitive serv-\\nice as long as he continues to occupy\\nsuch position.\\n\\nfe) Tenure shall mean the neriad af\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥 element index: 10\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 633.4870166015626,\n",
" 1223.220320048309\n",
" ],\n",
" [\n",
" 633.4870166015626,\n",
" 1377.5990489130434\n",
" ],\n",
" [\n",
" 979.7055175781251,\n",
" 1377.5990489130434\n",
" ],\n",
" [\n",
" 979.7055175781251,\n",
" 1223.220320048309\n",
" ]\n",
" ],\n",
" \"text\": \"(a) The excepted service shall include\\nall civilian positions in the executive\\nbranch of the Government which are\\nspecifically excepted from the require-\\nments of the Civil Service Act or from\\nthe competitive service by or pursuant\\nto statute or by OPM under \\u00a76.1 of this\\nsubchapter.\\n\\n(b) Excepted service shall have the\\n\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 632.1751940917969,\n",
" 997.1144248188406\n",
" ],\n",
" [\n",
" 632.1751940917969,\n",
" 1180.829898852657\n",
" ],\n",
" [\n",
" 983.0160424804687,\n",
" 1180.829898852657\n",
" ],\n",
" [\n",
" 983.0160424804687,\n",
" 997.1144248188406\n",
" ]\n",
" ],\n",
" \"text\": \"ee woe\\n\\n(e) Tenure shall mean the period of\\ntime an employee may reasonably ex-\\npect to serve under his current ap-\\npointment. Tenure shall be granted and\\ngoverned by the type of appointment\\nunder which an employee is currently\\nserving without regard to whether he\\nhas a competitive status or whether his\\nappointment is to a competitive posi-\\ntion or an excepted position.\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥 element index: 11\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 272.54268310546877,\n",
" 1335.6573671497583\n",
" ],\n",
" [\n",
" 272.54268310546877,\n",
" 1447.6836503623188\n",
" ],\n",
" [\n",
" 611.55046875,\n",
" 1447.6836503623188\n",
" ],\n",
" [\n",
" 611.55046875,\n",
" 1335.6573671497583\n",
" ]\n",
" ],\n",
" \"text\": \"Reema tar cian\\n\\n(a) Competitive service shall have the\\nsame meaning as the words \\u201cclassified\\nservice\\u2019, or \\u201cclassified (competitive)\\nservice\\u2019, or \\u201cclassified civil service\\u201d as\\ndefined in existing statutes and execu-\\ntive orders.\\n\\npp aioe\\n\\nBier: ee ce\\n\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 635.4368737792969,\n",
" 1198.3715579710145\n",
" ],\n",
" [\n",
" 635.4368737792969,\n",
" 1218.726645531401\n",
" ],\n",
" [\n",
" 953.8425000000001,\n",
" 1218.726645531401\n",
" ],\n",
" [\n",
" 953.8425000000001,\n",
" 1198.3715579710145\n",
" ]\n",
" ],\n",
" \"text\": \"\\u00a71.4 Extent of the excepted service.\\n\\n\",\n",
" \"type\": \"Title\"\n",
"}\n",
"\n",
"🔥 element index: 12\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 635.0781518554687,\n",
" 1374.0345486111112\n",
" ],\n",
" [\n",
" 635.0781518554687,\n",
" 1487.53078955314\n",
" ],\n",
" [\n",
" 979.6865698242187,\n",
" 1487.53078955314\n",
" ],\n",
" [\n",
" 979.6865698242187,\n",
" 1374.0345486111112\n",
" ]\n",
" ],\n",
" \"text\": \"deed nears genta\\n\\n(b) Excepted service shall have the\\nsame meaning as the words \\u201c\\u2018unclassi-\\nfied service\\u2019, or \\u201cunclassified civil\\nservice\\u2019, or \\u2018\\u2018positions outside the\\ncompetitive civil service\\u2019 as used in\\nexisting statutes and executive orders.\\n\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 633.4870166015626,\n",
" 1223.220320048309\n",
" ],\n",
" [\n",
" 633.4870166015626,\n",
" 1377.5990489130434\n",
" ],\n",
" [\n",
" 979.7055175781251,\n",
" 1377.5990489130434\n",
" ],\n",
" [\n",
" 979.7055175781251,\n",
" 1223.220320048309\n",
" ]\n",
" ],\n",
" \"text\": \"(a) The excepted service shall include\\nall civilian positions in the executive\\nbranch of the Government which are\\nspecifically excepted from the require-\\nments of the Civil Service Act or from\\nthe competitive service by or pursuant\\nto statute or by OPM under \\u00a76.1 of this\\nsubchapter.\\n\\n(b) Excepted service shall have the\\n\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥 element index: 13\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 264.3911499023438,\n",
" 1448.8040760869565\n",
" ],\n",
" [\n",
" 264.3911499023438,\n",
" 1488.7708333333333\n",
" ],\n",
" [\n",
" 617.1382470703126,\n",
" 1488.7708333333333\n",
" ],\n",
" [\n",
" 617.1382470703126,\n",
" 1448.8040760869565\n",
" ]\n",
" ],\n",
" \"text\": \"ge ee ee\\n(b) Competitive position shall mean a\\nposition in the competitive service.\\n\",\n",
" \"type\": \"Text\"\n",
"}\n",
"\n",
"🔥🔥 now is:\n",
"{\n",
" \"coordinates\": [\n",
" [\n",
" 635.0781518554687,\n",
" 1374.0345486111112\n",
" ],\n",
" [\n",
" 635.0781518554687,\n",
" 1487.53078955314\n",
" ],\n",
" [\n",
" 979.6865698242187,\n",
" 1487.53078955314\n",
" ],\n",
" [\n",
" 979.6865698242187,\n",
" 1374.0345486111112\n",
" ]\n",
" ],\n",
" \"text\": \"deed nears genta\\n\\n(b) Excepted service shall have the\\nsame meaning as the words \\u201c\\u2018unclassi-\\nfied service\\u2019, or \\u201cunclassified civil\\nservice\\u2019, or \\u2018\\u2018positions outside the\\ncompetitive civil service\\u2019 as used in\\nexisting statutes and executive orders.\\n\\n\",\n",
" \"type\": \"Text\"\n",
"}\n"
]
}
],
"source": [
"# Apply the new ordering, then print each original element next to the\n",
"# element that now occupies the same position.\n",
"elements_reord = [elements[position] for position in new_ixs]\n",
"for ix, (e, e_fix) in enumerate(zip(elements, elements_reord)):\n",
"    print(f\"\\n🔥 element index: {ix}\")\n",
"    print(json.dumps(e.to_dict(), indent=2))\n",
"    print(f\"\\n🔥🔥 now is:\")\n",
"    print(json.dumps(e_fix.to_dict(), indent=2))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8b590b81-575a-4f7b-93ab-891b49510ee1",
"metadata": {},
"outputs": [],
"source": [
"# Corner points of every element, in the corrected reading order\n",
"elements_coordinates_fix = [element.to_dict()['coordinates'] for element in elements_reord]\n",
"# Plot labels of the form '<1-based position>: <element type>'\n",
"elements_types_fix = [\n",
"    f\"{position}: {element.to_dict()['type']}\"\n",
"    for position, element in enumerate(elements_reord, start=1)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2f384b7e-5e9a-4c88-a7aa-8d3df965194e",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABG0AAAXMCAYAAACbWXE4AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOz9d5xW9Z3//z/e55yrzDWFoffemxgYEcUKNhTRZGPKmjXZJGKKKbv+skn2u4m4e8sm2U+y6pZEMXFjTNSYGCNgx4KAIF2kg/SZoQ1Mn6udc35/XHOO14WIWK+JPO+5EeAq57yuKu/nvN+vt/F9HxERERERERER6VisYhcgIiIiIiIiIiJvptBGRERERERERKQDUmgjIiIiIiIiItIBKbQREREREREREemAFNqIiIiIiIiIiHRACm1ERERERERERDqgDz20McZcYYzZaozZYYz53od9fhERERERERGRvwbG9/0P72TG2MA24FJgP7AS+Kzv+5s+tCJERERERERERP4KfNgzbSYDO3zf3+n7fhp4CLjmQ65BRERERERERKTDcz7k8/UF9uX9fT9w9lvduFu3bv6gQYM+6JpE3iP/FP58ostM3i/yfn+r24qIyIehpqaGPn36FLsMEREROY2sXr36iO/73Y+//MMObd6WMWY2MBtgwIABrFq1qsgViZyIzxuhip/3y4DvneQ+3hu3wwJs3hTemDeO64fnMBgsFN6IiHzwjDHU1NQUuwwRERE5jRhj9pzo8g87tKkG+uf9vV/7ZSHf9+cCcwGqqqo+vIY7Iu/ICWbM+HnBy9tNuMmXP9mm4E4m784Ka0RERERERE43H3ZPm5XAcGPMYGNMFPgMMO9DrkHkfWAKf/knmAWTPxnnrS4z+Vd4YIKZOMHxbApn44iIyAftxhtvLHYJIiIiIsCHPNPG9/2sMeZm4GlyI9F7fd/f+GHWIPL+OIUA5UQ3MXlzZ0yQ4LjtS6KCa+zcb75VeAhlNiIiH4q5c+cWuwQRERERoAg9bXzffwJ44sM+r8j76eQrnvJnzLi80ccGwMbDwsLDN8F1Wd6YXWMBEQyR3J+PP5GCGxGRD9ykSZNYvXp1scsQERER6XiNiOWjw/dPHG0Y8+6Sh+B4p3L/48/9dvc5/tincq7Cc5gT/tkAGB+/vTmxbxx83+ACxvfBeHm39sH3MWH74fbj+waM8hoRkQ/LmjVril2CiIiICKDQRj5g7zQ8ebv7v9vbv10d7yQQCm6fSmWwLBvHsQAf3zV4PviewbJtbBs838ktifLB9Xw8H7JZg+c7xGIxLBsMLgYP38uSyWbAyxKPRUilUhjfIhaNgKXIRkRERERE5HSj0EY+EEEIYlnWmy7PD1De7ayb48/zTm6Tf878sCb/8pPX5ZPNpolEbYyxaGlJ42YA3yWTzv3yPEinXRobMriuSzbrU9+Qor4hg+dHiMQM3bo4DBjYiZ69OhGJQlNjkspOZdgRCx+fWKy9AY7xACuYtnPKz42IiLw7vXv3LnYJIiIiIoBCG3kbbxeKnPryocLL3m1Y81b3O1kQdHwg81b1nKjeE93PByIRm5ZkG9FoKdFYhJ37DrNt6z62bNrL4YNNJFs9GhvbaGpqI5XM4GY9MA5Zt4Ssa5POtBGJeIwc2Z9p08czefJA+vStwM3kutxEYuC6LpgsxgfLioLRx1VE5MNQU1NT7BJEREREAIU2UiSnGtq83e2CsOVEt8vvT+N53psuC34/0cybtz+3hcHB9wxuFiorE/Tq2Y01K19nxStrwI/Ro3sf+vcdRM9e3elc6RCL2diRruzd6/Hq+g3sr97FurXb2L+vmhef78LlMyZx6SVjKEnYpNMQi0XAeLhuBh9Pc2xERD4kc+bMYc6cOcUuQ0REREShjXxwgtAjPyh5r8uhADzPKzj2CWfD5C3DOtHsnvyw59SXRYW3IusaYtESfB8iDsRjNps2bWfPnp2cOXEcU86ewoQzel
BaWkZ5eQm2Y7F/fzPPvfAqu/fXcPjYLjJeE5EI1Dce4ci6PWSyjQwf3p9xYysxgOf6uL6H49hoWZSIyIfntttuU2gjIiIiHYJCG3lbJ1si9VYhR3C553m5ZT7k+tu8VY+b4Fd+iHKi0Cc4nmVZ2LZdUF/+OfOPGZz3+Bk1wfk8zyOTyWCMIRKJYFnWKSwLI9zhqbo6xaOPLmDlyrXMmHEJl0w/m06VCWzHkEx5lJVZHD1Wz5otz1N9dB8DRvfkjLPOpKSknJ7demLbDnt27WL3zu08/8JSmhtHMrFqOLG4hW1igIcx1knrERERERERkY8e80535/kwVVVV+atWrSp2GR8Zb9VjxnVdHMcJ/57JZIhGowBks1kymQyWZeF5XsEyI8dxcBwHz/MK7p/NZnEch0wmQyaTCWfGRCIRAGzbDm+fzWbDUCebzQJvNC92HAdjDK7rEolE8DyPVCoVBjGxWAzHccIAJpvNhuFMUHd+vZFIJAx+gmAnGo2SyWRoa2vDsixKSkqIRqPYtn3SWTeZrMED6o9m+J//uZdtO7bw9a/9PZOqxmI7Fk4Yh/p4fhKMi+tm8fwoGT+CjY1jGXwfsilDxPJJtnns2nGMhx76HRUVJXzmM59g4OBuOBGDsWjf9lszbkREPmhv1edMRERE5INijFnt+37Vmy7vyP8oUWjz/snv65JOp4nFYmSzWdasWcPzzz/PV77yFUpLS9m7dy/f//73uf/++4lEIqxYsYLXXnuNc889l+XLl7Nhwwaam5sZNmwYEyZMwBjDa6+9xuc+9zl69+7N9u3b+eUvf8lXv/pVnn76afbu3UuvXr3o3LkzM2fO5MEHHySVSvHFL36Rzp0789hjj1FfX8+FF17I/Pnzqa2tJZvNUlpayqc+9Smampp49NFH+eEPf8jq1at54okn6Nu3Ly0tLYwfP54rrrgCy7I4cuQId911F5deeimTJk1i+/btPP300+zatQvf9+nevTt/+7d/y+rVq1m4cCFjxoyhS5cuXHrppfz2t7+jqamZ8vIEra0tdO5cyac//Wl69+6N74Pngef5WMaQyeQCKmMZDhxOc++9f+L1XZv4/33nq4wa2Rff87EMJFNpSuIxDD6+77W/Bh6+nQbfBj+O73tkMm2URMsgC3iQSvkY2+Xe3zzNS4tW8ZnrZ3LRtDOoKItgmdwMHxER+WCtXr2aSZMmFbsMEREROY28VWij5VGniWA2im3bxGIxjDE0NDTw8ssv09LSwrPPPsvVV1+N7/uMHz8eyIU7iUSCRCJB7969+dKXvsTixYtZtWoV3/rWt8hkMixZsgTHcWhububo0aPhrJ1u3bpRXl7OtGnTGDZsGF27diUSiVBZWcmiRYv485//zHXXXUefPn1IJBIMHDiQb3zjGzz55JOUl5dz1llnEY1GefXVV+nfvz/79+/nscce45Of/CSTJ0+mrq6OY8eO0djYSDweZ8eOHVRWVvLAAw/Qq1cvhg4dype//GUWLFhAaWkp06ZNI5FIsGvXLgYOHMjHP/5xHMehe7fudKqoZMyYMcyceSWNjQ3MnXs3f/rTn/jqV7/avgTLEInYNNS3UNmpnGTSp6UtwxOPL6em9iA/+OH36d69JPdTWeNiDMRjBgufdCaFbWxs2wJjkc3GMZZFOtW+bMuPcLD2GNmkhe86JEqipPwss66+kPVrD/OLX/yJRHmMiy8Yi+UosRERERERETmdKLQ5TQRTvT3PC5cvzZ8/n65duzJixAiWLl3K6NGjKS8vp7GxEc/zSCQSAMRiMaLRaNj7BXIhUCwWw7ZtDhw4wP79+zl69Ch1dXWUlJRgjCGTybB161YOHjzIlClT6NOnD7179+aGG27ghRdeYN68eQwYMADXdXFdN1yyFPSg8TwvDJgOHjyI4zj07t0by7Lo1q0bnTt3xnEcdu7cyaZNmxg2bBg1NTUsWLCAr33ta+FSKsdxwqVRiUSC/fv3s27dOvr370/v3r2JRqOUlJ
Tgui6lpWVMnjyZZcuW0draSllZGY7jkM26xONRsq6P78PqVVupqa7h5q/9Hb17lGI7Pvge4OL5Lo5lYchgGRfLNoA
"text/plain": [
"<Figure size 1440x1920 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Draw the page with the *_fix coordinate/label lists at a figure width of 20.\n",
"plot_image_with_bounding_boxes_coloured(\n",
"    image_path,\n",
"    bounding_boxes=elements_coordinates_fix,\n",
"    text_labels=elements_types_fix,\n",
"    desired_width=20,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "c9ce09d4-a325-4414-b649-c22299e6300d",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABG0AAAXMCAYAAACbWXE4AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOzdd3gU16E28He2r6RVL6iAJBASoiOJ3l3oYIw7dlySGOc6/khxnOvcFOPcOPUmTnKTGxtsx93GDdNtuqlGgAwCIboKqAt1afvO98dyDrNCNBuzInl/z6NH0pbZszOzO3PeOUVRVRVERERERERERNS96IJdACIiIiIiIiIiuhBDGyIiIiIiIiKiboihDRERERERERFRN8TQhoiIiIiIiIioG2JoQ0RERERERETUDTG0ISIiIiIiIiLqhq57aKMoyjRFUY4qinJCUZSnr/frExERERERERHdCBRVVa/fiymKHsAxALcCOANgD4D7VFU9fN0KQURERERERER0A7jeLW1GADihquopVVVdAN4FcNt1LgMRERERERERUbdnuM6vlwzgtOb/MwBGXuzBsbGxalpa2tddJiIiIiKpsrISSUlJwS4GERER/RvZt29fvaqqcZ1vv96hzWUpirIAwAIA6NWrF/bu3RvkEhEREdG/E0VRUFlZGexiEBER0b8RRVHKurr9enePqgDQU/N/yrnbJFVVF6uqmqeqal5c3AUhExERERERERHRv4XrHdrsAdBXUZR0RVFMAO4FsOI6l4GIiIjooh599NFgF4GIiIgIwHXuHqWqqkdRlCcAfApAD+AVVVWLrmcZiIiIiC5l8eLFwS4CEREREYDr39IGqqquUVU1U1XVPqqqPne9X5+IiIjoUnJzc4NdBCIiIiIAQQhtiIiIiLqzgoKCYBeBiIiICABDGyIiIiIiIiKibomhDREREZFGYmJisItAREREBIChDREREVGAysrKYBeBiIiICABDGyIiIqIAixYtCnYRiIiIiAAwtCEiIiIK8Oyzzwa7CEREREQAGNoQEREREREREXVLDG2IiIiIiIiIiLohhjZEREREGnv37g12EYiIiIgAMLQhIiIiIiIiIuqWGNoQERERaeTl5QW7CEREREQAGNoQEREREREREXVLDG2IiIiIiIiIiLohhjZEREREGs8880ywi0BEREQEgKENERERUYBFixYFuwhEREREABjaEBEREQVISkoKdhGIiIiIADC0ISIiIgpQVVUV7CIQERERAWBoQ0RERERERETULTG0ISIiItLIyckJdhGIiIiIADC0ISIiIgqwb9++YBeBiIiICABDGyIiIqIACxYsCHYRiIiIiAAwtCEiIiIKsGTJkmAXgYiIiAgAQxsiIiIiIiIiom6JoQ0RERERERERUTfE0IaIiIhIo6KiIthFICIiIgLA0IaIiIgoAGePIiIiou6CoQ0RERGRxpw5c4JdBCIiIiIADG2IiIiIiIiIiLolhjZERERERERERN0QQxsiIiIijRdffDHYRSAiIiICwNCGiIiIKMCCBQuCXQQiIiIiAAxtiIiIiAIoihLsIhAREREBYGhDRERERERERNQtMbQhIiIiIiIiIuqGGNoQERERacyaNSvYRSAiIiICwNCGiIiIKMDKlSuDXQQiIiIiAAxtiIiIiALMnj072EUgIiIiAsDQhoiIiCjAqlWrgl0EIiIiIgAMbYiIiIiIiIiIuiWGNkRERERERERE3RBDGyIiIiINVVWDXQQiIiIiAAxtiIiIiAIsXrw42EUgIiIiAsDQhoiIiCjAY489FuwiEBEREQFgaENERERERERE1C0xtCEiIiIiIiIi6oYY2hARERFprFixIthFICIiIgLA0IaIiIgoQG5ubrCLQERERASAoQ0RERFRgOTk5GAXgYiIiAgAQxsiIiIiIiIiom6JoQ0RERERERERUTfE0IaIiIhI49FHHw12EYiIiIgAMLQhIiIiCr
B48eJgF4GIiIgIAEMbIiIiogCcPYqIiIi6C4Y2RERERBoFBQXBLgIRERERAIY2RERERERERETdEkMbIiIiIo3ExMRgF4GIiIgIAEMbIiIiogCVlZXBLgIRERERAIY2RERERAEWLVoU7CIQERERAWBoQ0RERBTg2WefDXYRiIiIiAAwtCEiIiIiIiIi6pYY2hARERERERERdUMMbYiIiIg09u7dG+wiEBEREQFgaENERERERERE1C0xtCEiIiLSyMvLC3YRiIiIiAAwtCEiIiIiIiIi6pYY2hARERERERERdUMMbYiIiIg0nnnmmWAXgYiIiAgAQxsiIiKiAIsWLQp2EYiIiIgAMLQhIiIiCpCUlBTsIhAREREBYGhDREREFKCqqirYRSAiIiICwNCGiIiIiIiIiKhbYmhDREREpJGTkxPsIhAREREBYGhDREREFGDfvn3BLgIRERERAIY2RERERAEWLFgQ7CIQERERAWBoQ0RERBRgyZIlwS4CEREREQCGNkRERERERERE3RJDGyIiIiIiIiKiboihDREREZFGRUVFsItAREREBIChDREREVEAzh5FRERE3QVDGyIiIiKNOXPmBLsIRERERAAY2hARERERERERdUsMbYiIiIiIiIiIuiGGNkREREQaL774YrCLQERERASAoQ0RERFRgAULFgS7CEREREQAGNoQERERBVAUJdhFICIiIgLA0IaIiIiIiIiIqFtiaENERERERERE1A0xtCEiIiLSmDVrVrCLQERERASAoQ0RERFRgJUrVwa7CEREREQAGNoQERERBZg9e3awi0BEREQEgKENERERUYBVq1YFuwhEREREABjaEBERERERERF1SwxtiIiIiIiIiIi6IYY2RERERBqqqga7CEREREQAGNoQERERBVi8eHGwi0BEREQEgKENERERUYDHHnss2EUgIiIiAsDQhoiIiIiIiIioW2JoQ0RERERERETUDTG0ISIiItJYsWJFsItAREREBIChDREREVGA3NzcYBeBiIiICABDGyIiIqIAycnJwS4CEREREQCGNkRERERERERE3RJDGyIiIiIiIiKiboihDREREZHGo48+GuwiEBEREQFgaENEREQUYPHixcEuAhEREREAhjZEREREATh7FBEREXUXDG2IiIiINAoKCoJdBCIiIiIADG2IiIiIiIiIiLolhjZEREREGomJicEuAhEREREAhjZEREREASorK4NdBCIiIiIADG2IiIiIAixatCjYRSAiIiICwNCGiIiIKMCzzz4b7CIQERERAWBoQ0RERERERETULTG0ISIiIiIiIiLqhhjaEBEREWns3bs32EUgIiIiAsDQhoiIiIiIiIioW2JoQ0RERKSRl5cX7CIQERERAWBoQ0RERERERETULTG0ISIiIiIiIiLqhhjaEBEREWk888wzwS4CEREREQCGNkREREQBFi1aFOwiEBEREQFgaENEREQUICkpKdhFICIiIgLA0IaIiIgoQFVVVbCLQERERASAoQ0RERERERERUbfE0IaIiIhIIycnJ9hFICIiIgLA0IaIiIgowL59+4JdBCIiIiIADG2IiIiIAixYsCDYRSAiIiICwNCGiIiIKMCSJUuCXQQiIiIiAAxtiIiIiIiIiIi6JYY2RERERERERETdEEMbIiIiIo2KiopgF4GIiIgIAEMbIiIiogCcPYqIiIi6C4Y2RERERBpz5swJdhGIiIiIADC0ISIiIiIiIiLqlhjaEBERERERERF1QwxtiIiIiDRefPHFYBeBiIiICABDGyIiIqIACxYsCHYRiIiIiAAwtCEiIiIKoChKsItAREREBIChDRERERERERFRt8TQhoiIiIiIiIioG2JoQ0RERKQxa9asYBeBiIiICABDGyIiIqIAK1euDHYRiIiIiAAwtCEiIiIKMHv27GAXgYiIiAgAQxsiIiKiAKtWrQp2EYiIiIgAMLQhIiIiIiIiIuqWGNoQEREREREREX
VDDG2IiIiINFRVDXYRiIiIiAAwtCEiIiIKsHjx4mAXgYiIiAgAQxsiIiKiAI899liwi0BEREQEgKENEREREREREVG
"text/plain": [
"<Figure size 1440x1920 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Second sample page: double-column image B.\n",
"image_path = \"../../example-docs/double-column-B.jpg\"\n",
"\n",
"# Read the page size inside a context manager so the file handle is closed\n",
"# immediately instead of staying open for the rest of the kernel session.\n",
"# `image`, `width` and `height` are still defined afterwards, as before.\n",
"with Image.open(image_path) as image:\n",
"    width, height = image.size\n",
"\n",
"# Detect the layout of the image (use DocumentLayout.from_file for PDFs).\n",
"layout = DocumentLayout.from_image_file(image_path)\n",
"elements = layout.pages[0].elements  # elements of the first (only) page used here\n",
"\n",
"# One bounding box and one numbered \"<position>: <type>\" label per element,\n",
"# in the order the layout returned them (numbering starts at 1).\n",
"elements_coordinates = [e.to_dict()['coordinates'] for e in elements]\n",
"elements_types = [f\"{ix}: {e.to_dict()['type']}\" for ix, e in enumerate(elements, start=1)]\n",
"\n",
"plot_image_with_bounding_boxes_coloured(image_path, elements_coordinates, elements_types, desired_width=20)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "7733a268-a58f-478f-9c31-9a38a02983a3",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABG0AAAXMCAYAAACbWXE4AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOzdd3hUx7k/8O/ZvpJWvaACkkAgREcSvbvQwRj3bic2znVySXXi3BTjJE5unF9iJzfFgHvHDdNtuqlGgEwToquAulCXtu/5/bHMcFYIATZmRfL9PI8eSVvOmT3n7O7MO+/MKKqqgoiIiIiIiIiIuhZdsAtAREREREREREQXYtCGiIiIiIiIiKgLYtCGiIiIiIiIiKgLYtCGiIiIiIiIiKgLYtCGiIiIiIiIiKgLYtCGiIiIiIiIiKgLuuZBG0VRpiqKclRRlBOKojx1rfdPRERERERERHQ9UFRVvXY7UxQ9gGMAbgZwBsBuAPeoqnr4mhWCiIiIiIiIiOg6cK0zbYYDOKGq6ilVVV0A3gNwyzUuAxERERERERFRl2e4xvtLBnBa8/8ZACMu9uDY2Fg1LS3tmy4TERERkVReXo6kpKRgF4OIiIj+g+zdu7dWVdW49rdf66DNJSmKMg/APADo0aMH9uzZE+QSERER0X8SRVFQXl4e7GIQERHRfxBFUUo6uv1aD48qA9Bd83/KudskVVUXqaqaq6pqblzcBUEmIiIiIiIiIqL/CNc6aLMbQG9FUdIVRTEBuBvA8mtcBiIiIqKLeuyxx4JdBCIiIiIA13h4lKqqHkVRvgfgMwB6AK+oqlpwLctARERE1JlFixYFuwhEREREAK59pg1UVV2tqmofVVV7qar67LXePxEREVFncnJygl0EIiIiIgBBCNoQERERdWX5+fnBLgIRERERAAZtiIiIiIiIiIi6JAZtiIiIiDQSExODXQQiIiIiAAzaEBEREQUoLy8PdhGIiIiIADBoQ0RERBRgwYIFwS4CEREREQAGbYiIiIgCPPPMM8EuAhEREREABm2IiIiIiIiIiLokBm2IiIiIiIiIiLogBm2IiIiINPbs2RPsIhAREREBYNCGiIiIiIiIiKhLYtCGiIiISCM3NzfYRSAiIiICwKANEREREREREVGXxKANEREREREREVEXxKANERERkcbTTz8d7CIQERERAWDQhoiIiCjAggULgl0EIiIiIgAM2hAREREFSEpKCnYRiIiIiAAwaENEREQUoKKiIthFICIiIgLAoA0RERERERERUZfEoA0RERGRRnZ2drCLQERERASAQRsiIiKiAHv37g12EYiIiIgAMGhDREREFGDevHnBLgIRERERAAZtiIiIiAIsXrw42EUgIiIiAsCgDRERERERERFRl8SgDRERERERERFRF8SgDREREZFGWVlZsItAREREBIBBGyIiIqIAXD2KiIiIugoGbYiIiIg0Zs+eHewiEBEREQFg0IaIiIiIiIiIqEti0IaIiIiIiIiIqAti0IaIiIhIY+HChcEuAhEREREABm2IiIiIAsybNy/YRSAiIiICwKANERERUQBFUYJdBCIiIiIADNoQEREREREREXVJDNoQEREREREREXVBDNoQERERacycOTPYRSAiIiICwKANERERUYAVK1YEuwhEREREABi0ISIiIgowa9asYBeBiIiICACDNkREREQBVq5cGewiEBEREQFg0IaIiIiIiIiIqEti0IaIiIiIiIiIqAti0IaIiIhIQ1XVYBeBiIiICACDNkREREQBFi1aFOwiEBEREQFg0IaIiIgowOOPPx7sIhAREREBYNCGiIiIiIiIiKhLYtCGiIiIiIiIiKgLYtCGiIiISGP58uXBLgIRERERAAZtiIiIiALk5OQEuwhEREREABi0ISIiIgqQnJwc7CIQERERAWDQhoiIiIiIiIioS2LQhoiIiIiIiIioC2LQhoiIiEjjscceC3YRiIiIiAAwaE
NEREQUYNGiRcEuAhEREREABm2IiIiIAnD1KCIiIuoqGLQhIiIi0sjPzw92EYiIiIgAMGhDRERERERERNQlMWhDREREpJGYmBjsIhAREREBYNCGiIiIKEB5eXmwi0BEREQEgEEbIiIiogALFiwIdhGIiIiIADBoQ0RERBTgmWeeCXYRiIiIiAAwaENERERERERE1CUxaENERERERERE1AUxaENERESksWfPnmAXgYiIiAgAgzZERERERERERF0SgzZEREREGrm5ucEuAhEREREABm2IiIiIiIiIiLokBm2IiIiIiIiIiLogBm2IiIiINJ5++ulgF4GIiIgIAIM2RERERAEWLFgQ7CIQERERAWDQhoiIiChAUlJSsItAREREBIBBGyIiIqIAFRUVwS4CEREREQAGbYiIiIiIiIiIuiQGbYiIiIg0srOzg10EIiIiIgAM2hAREREF2Lt3b7CLQERERASAQRsiIiKiAPPmzQt2EYiIiIgAMGhDREREFGDx4sXBLgIRERERAAZtiIiIiIiIiIi6JAZtiIiIiIiIiIi6IAZtiIiIiDTKysqCXQQiIiIiAAzaEBEREQXg6lFERETUVTBoQ0RERKQxe/bsYBeBiIiICACDNkREREREREREXRKDNkREREREREREXRCDNkREREQaCxcuDHYRiIiIiAAwaENEREQUYN68ecEuAhEREREABm2IiIiIAiiKEuwiEBEREQFg0IaIiIiIiIiIqEti0IaIiIiIiIiIqAti0IaIiIhIY+bMmcEuAhEREREABm2IiIiIAqxYsSLYRSAiIiICwKANERERUYBZs2YFuwhEREREABi0ISIiIgqwcuXKYBeBiIiICACDNkREREREREREXRKDNkREREREREREXRCDNkREREQaqqoGuwhEREREABi0ISIiIgqwaNGiYBeBiIiICACDNkREREQBHn/88WAXgYiIiAgAgzZERERERERERF0SgzZERERERERERF0QgzZEREREGsuXLw92EYiIiIgAMGhDREREFCAnJyfYRSAiIiICwKANERERUYDk5ORgF4GIiIgIAIM2RERERERERERdEoM2RERERERERERdEIM2RERERBqPPfZYsItAREREBIBBGyIiIqIAixYtCnYRiIiIiAAwaENEREQUgKtHERERUVfBoA0RERGRRn5+frCLQERERASAQRsiIiIiIiIioi6JQRsiIiIijcTExGAXgYiIiAgAgzZEREREAcrLy4NdBCIiIiIADNoQERERBViwYEGwi0BEREQEgEEbIiIiogDPPPNMsItAREREBIBBGyIiIiIiIiKiLolBGyIiIiIiIiKiLohBGyIiIiKNPXv2BLsIRERERAAYtCEiIiIiIiIi6pIYtCEiIiLSyM3NDXYRiIiIiAAwaENERERERERE1CUxaENERERERERE1AUxaENERESk8fTTTwe7CEREREQAGLQhIiIiCrBgwYJgF4GIiIgIAIM2RERERAGSkpKCXQQiIiIiAAzaEBEREQWoqKgIdhGIiIiIADBoQ0RERERERETUJTFoQ0RERKSRnZ0d7CIQERERAWDQhoiIiCjA3r17g10EIiIiIgAM2hAREREFmDdvXrCLQERERASAQRsiIiKiAIsXLw52EYiIiIgAMGhDRERERERERNQlMWhDRERERERERNQFMWhDREREpFFWVhbsIhAREREBYNCGiIiIKABXjyIiIqKugkEbIiIiIo3Zs2cHuwhEREREABi0ISIiIiIiIiLqkhi0ISIiIiIiIiLqghi0ISIiItJYuHBhsItAREREBIBBGyIiIqIA8+bNC3YRiIiIiAAwaENEREQUQFGUYBeBiIiICACDNkREREREREREXRKDNkREREREREREXRCDNkREREQaM2fODHYRiIiIiAAwaENEREQUYMWKFcEuAhEREREABm2IiIiIAsyaNSvYRSAiIiICwKANERERUYCVK1cGuwhEREREABi0ISIiIi
IiIiLqkhi0ISIiIiIiIiLqghi0ISIiItJQVTXYRSAiIiICwKANERERUYBFixYFuwhEREREABi0ISIiIgrw+OOPB7s
"text/plain": [
"<Figure size 1440x1920 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# The reorder helper returns positions into `elements`; pick the elements in that order.\n",
"new_ixs = reorder_elements_in_double_columns(image_path, elements_coordinates)\n",
"elements_reord = [elements[position] for position in new_ixs]\n",
"\n",
"# Rebuild the coordinates and the numbered labels so the numbering follows the new order.\n",
"elements_coordinates_fix = [element.to_dict()[\"coordinates\"] for element in elements_reord]\n",
"elements_types_fix = [\n",
"    f\"{ix}: {e.to_dict()['type']}\"\n",
"    for ix, e in enumerate(elements_reord, start=1)\n",
"]\n",
"\n",
"plot_image_with_bounding_boxes_coloured(\n",
"    image_path,\n",
"    bounding_boxes=elements_coordinates_fix,\n",
"    text_labels=elements_types_fix,\n",
"    desired_width=20,\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}