diff --git a/CHANGELOG.md b/CHANGELOG.md
index c405be937..f2c0e0139 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,7 +20,7 @@
 ### Features

 * **Add ad-hoc fields to ElementMetadata instance.** End-users can now add their own metadata fields simply by assigning to an element-metadata attribute-name of their choice, like `element.metadata.coefficient = 0.58`. These fields will round-trip through JSON and can be accessed with dotted notation.
-* **MongoDB Destination Connector** New destination connector added to all CLI ingest commands to support writing partitioned json output to mongodb.
+* **MongoDB Destination Connector.** New destination connector added to all CLI ingest commands to support writing partitioned json output to mongodb.

 ### Fixes

diff --git a/test_unstructured/metrics/test_evaluate.py b/test_unstructured/metrics/test_evaluate.py
index 8053d12ba..92cbba2f1 100644
--- a/test_unstructured/metrics/test_evaluate.py
+++ b/test_unstructured/metrics/test_evaluate.py
@@ -1,6 +1,7 @@
 import os
 import pathlib

+import pandas as pd
 import pytest

 from unstructured.metrics.evaluate import (
@@ -34,3 +35,15 @@ def test_text_extraction_takes_list():
     with open(os.path.join(export_dir, "all-docs-cct.tsv")) as f:
         lines = f.read().splitlines()
     assert len(lines) == len(output_list) + 1  # includes header
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+def test_text_extraction_grouping():
+    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
+    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
+    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
+    measure_text_edit_distance(
+        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, grouping="doctype"
+    )
+    df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
+    assert len(df) == 4
diff --git a/test_unstructured_ingest/check-diff-evaluation-metrics.sh b/test_unstructured_ingest/check-diff-evaluation-metrics.sh
index 90d6555ac..1c18082be 100755
--- a/test_unstructured_ingest/check-diff-evaluation-metrics.sh
+++ b/test_unstructured_ingest/check-diff-evaluation-metrics.sh
@@ -8,7 +8,7 @@
 # Environment Variables:
 #   - OVERWRITE_FIXTURES: Controls whether to overwrite fixtures or not. default: "false"

-set -e
+set +e

 SCRIPT_DIR=$(dirname "$(realpath "$0")")
 OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
@@ -45,10 +45,9 @@ if [ "$OVERWRITE_FIXTURES" != "false" ]; then
   # force copy (overwrite) files from metrics-tmp (new eval metrics) to metrics (old eval metrics)
   mkdir -p "$METRICS_DIR"
   cp -rf "$TMP_METRICS_LATEST_RUN_DIR" "$OUTPUT_ROOT/metrics"
-elif ! diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR" ; then
+elif ! diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR" ; then
   "$SCRIPT_DIR"/clean-permissions-files.sh "$TMP_METRICS_LATEST_RUN_DIR"
-  diff -r "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR"> metricsdiff.txt
-  cat metricsdiff.txt
+  diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR"> metricsdiff.txt
  diffstat -c metricsdiff.txt
  echo
  echo "There are differences from the previously checked-in structured outputs."
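A minimal usage sketch of the grouping flow the new test drives, outside pytest and with hypothetical directories (the real test uses the TESTING_FILE_DIR fixtures shown above):

```python
import pandas as pd

from unstructured.metrics.evaluate import measure_text_edit_distance

# With grouping set, a per-doctype aggregate is written as all-doctype-agg-cct.tsv
# alongside aggregate-scores-cct.tsv in the export directory.
measure_text_edit_distance(
    output_dir="structured-output",  # hypothetical directory of partitioned .json output
    source_dir="gold-standard-cct",  # hypothetical directory of gold-standard .txt files
    export_dir="metrics-results",    # hypothetical export directory
    grouping="doctype",
)
print(pd.read_csv("metrics-results/all-doctype-agg-cct.tsv", sep="\t"))
```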
diff --git a/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv b/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv
index 9139c72b7..526bd6f08 100644
--- a/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv
+++ b/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv
@@ -1,2 +1,2 @@
-strategy average sample_sd population_sd count
-element-type-accuracy 0.814 0.108 0.077 2
+metric average sample_sd population_sd count
+element-type-accuracy 0.814 0.108 0.077 2
diff --git a/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv b/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv
index 4d141d4f7..aa368f746 100644
--- a/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv
+++ b/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv
@@ -1,3 +1,3 @@
-filename doctype connector element-type-accuracy
-IRS-form-1987.pdf pdf azure 0.89
-page-with-formula.pdf pdf s3 0.737
+filename doctype connector element-type-accuracy
+IRS-form-1987.pdf pdf azure 0.89
+page-with-formula.pdf pdf s3 0.737
diff --git a/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv b/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv
index bc8003243..8865cec2c 100644
--- a/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv
+++ b/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv
@@ -1,3 +1,3 @@
-strategy average sample_sd population_sd count
-cct-accuracy 0.792 0.253 0.245 15
-cct-%missing 0.025 0.034 0.033 15
+metric average sample_sd population_sd count
+cct-accuracy 0.803 0.249 0.241 16
+cct-%missing 0.024 0.033 0.032 16
diff --git a/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv b/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv
index 6a5dc96cf..9858ebfdd 100644
--- a/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv
+++ b/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv
@@ -1,16 +1,17 @@
-filename doctype connector cct-accuracy cct-%missing
-Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.007
-IRS-form-1987.pdf pdf azure 0.783 0.135
-spring-weather.html html azure 0.0 0.018
-fake-text.txt txt Sharepoint 1.0 0.0
-stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
-ideas-page.html html Sharepoint 0.929 0.033
-UDHR_first_article_all.txt txt local-single-file 0.995 0.0
-example-10k.html html local 0.686 0.037
-ideas-page.html html local 0.929 0.033
-fake-html-cp1252.html html local 0.659 0.0
-fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
-layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.029
-layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
-2023-Jan-economic-outlook.pdf pdf s3 0.846 0.039
-recalibrating-risk-report.pdf pdf s3 0.973 0.007
+filename doctype connector cct-accuracy cct-%missing
+fake-text.txt txt Sharepoint 1.0 0.0
+ideas-page.html html Sharepoint 0.929 0.033
+stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
+Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.007
+IRS-form-1987.pdf pdf azure 0.783 0.135
+spring-weather.html html azure 0.0 0.018
+example-10k.html html local 0.686 0.037
+fake-html-cp1252.html html local 0.659 0.0
+ideas-page.html html local 0.929 0.033
+UDHR_first_article_all.txt txt local-single-file 0.995 0.0
+fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
+layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
+layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.029
+2023-Jan-economic-outlook.pdf pdf s3 0.846 0.039
+page-with-formula.pdf pdf s3 0.971 0.021
+recalibrating-risk-report.pdf pdf s3 0.973 0.007
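The aggregate fixture is derived from the per-document fixture just above; a small sketch (run from the repository root, reusing the checked-in path from this diff) that recomputes the cct-accuracy row of aggregate-scores-cct.tsv from all-docs-cct.tsv:

```python
import statistics

import pandas as pd

# Per-document scores checked in above (16 rows after this change).
df = pd.read_csv(
    "test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv", sep="\t"
)
scores = df["cct-accuracy"].tolist()

# Should reproduce the updated fixture row: 0.803  0.249  0.241  16
print(
    round(statistics.mean(scores), 3),
    round(statistics.stdev(scores), 3),
    round(statistics.pstdev(scores), 3),
    len(scores),
)
```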
diff --git a/unstructured/ingest/evaluate.py b/unstructured/ingest/evaluate.py
index 7b055cf4f..a3e5dfe0a 100755
--- a/unstructured/ingest/evaluate.py
+++ b/unstructured/ingest/evaluate.py
@@ -36,6 +36,7 @@ def main():
     help="Directory to save the output evaluation metrics to. Default to \
         your/working/dir/metrics/",
 )
+@click.option("--grouping", type=str, help="Input field for aggregation, or leave blank if none.")
 @click.option(
     "--weights",
     type=(int, int, int),
@@ -50,10 +51,11 @@ def measure_text_edit_distance_command(
     output_list: Optional[List[str]],
     source_list: Optional[List[str]],
     export_dir: str,
+    grouping: Optional[str],
     weights: Tuple[int, int, int],
 ):
     return measure_text_edit_distance(
-        output_dir, source_dir, output_list, source_list, export_dir, weights
+        output_dir, source_dir, output_list, source_list, export_dir, grouping, weights
     )


diff --git a/unstructured/metrics/element_type.py b/unstructured/metrics/element_type.py
index 6c77d6eb1..3e4e8cbf8 100644
--- a/unstructured/metrics/element_type.py
+++ b/unstructured/metrics/element_type.py
@@ -7,6 +7,11 @@ def get_element_type_frequency(
 ) -> Union[Dict[Tuple[str, Optional[int]], int], Dict]:
     """
     Calculate the frequency of Element Types from a list of elements.
+
+    Args:
+        elements (str): String-formatted json of all elements (as a result of elements_to_json).
+    Returns:
+        Element type and its frequency in dictionary format.
     """
     frequency: Dict = {}
     if len(elements) == 0:
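The docstring added above spells out the input contract for get_element_type_frequency; a minimal sketch (paths are hypothetical) of how it pairs with calculate_element_type_percent_match, mirroring the evaluation code later in this diff:

```python
from unstructured.metrics.element_type import (
    calculate_element_type_percent_match,
    get_element_type_frequency,
)

# Both files contain element JSON as produced by elements_to_json().
with open("structured-output/example.pdf.json") as f:  # hypothetical output path
    output_frequency = get_element_type_frequency(f.read())
with open("gold-standard/example.pdf.json") as f:  # hypothetical gold-standard path
    source_frequency = get_element_type_frequency(f.read())

# Frequencies are dicts keyed by (element type, optional depth); the match is the score
# that measure_element_type_accuracy rounds to three places.
print(round(calculate_element_type_percent_match(output_frequency, source_frequency), 3))
```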
diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py
index c129cfb4e..79095fe60 100755
--- a/unstructured/metrics/evaluate.py
+++ b/unstructured/metrics/evaluate.py
@@ -1,13 +1,13 @@
 #!/usr/bin/env python3

-import csv
 import logging
 import os
 import statistics
 import sys
-from typing import Any, List, Optional, Tuple
+from typing import List, Optional, Tuple, Union

 import click
+import pandas as pd

 from unstructured.metrics.element_type import (
     calculate_element_type_percent_match,
@@ -29,7 +29,7 @@
 if "ingest_log_handler" not in [h.name for h in logger.handlers]:
     logger.setLevel(logging.DEBUG)

-agg_headers = ["strategy", "average", "sample_sd", "population_sd", "count"]
+agg_headers = ["metric", "average", "sample_sd", "population_sd", "count"]


 def measure_text_edit_distance(
@@ -38,6 +38,7 @@ def measure_text_edit_distance(
     output_list: Optional[List[str]] = None,
     source_list: Optional[List[str]] = None,
     export_dir: str = "metrics",
+    grouping: Optional[str] = None,
     weights: Tuple[int, int, int] = (2, 1, 1),
 ) -> None:
     """
@@ -58,50 +59,57 @@ def measure_text_edit_distance(
         sys.exit(0)

     rows = []
-    accuracy_scores: List[float] = []
-    percent_missing_scores: List[float] = []

     # assumption: output file name convention is name-of-file.doc.json
     for doc in output_list:  # type: ignore
-        fn = (doc.split("/")[-1]).split(".json")[0]
-        doctype = fn.rsplit(".", 1)[-1]
-        fn_txt = fn + ".txt"
+        filename = (doc.split("/")[-1]).split(".json")[0]
+        doctype = filename.rsplit(".", 1)[-1]
+        fn_txt = filename + ".txt"
         connector = doc.split("/")[0]

+        # not all odetta cct files follow the same naming convention;
+        # some exclude the original filetype from the name
+        if fn_txt not in source_list:
+            fn = filename.rsplit(".", 1)[0]
+            fn_txt = fn + ".txt"
+
         if fn_txt in source_list:  # type: ignore
             output_cct = elements_to_text(elements_from_json(os.path.join(output_dir, doc)))
             source_cct = _read_text(os.path.join(source_dir, fn_txt))
             accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3)
             percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3)
-            rows.append([fn, doctype, connector, accuracy, percent_missing])
-            accuracy_scores.append(accuracy)
-            percent_missing_scores.append(percent_missing)
+            rows.append([filename, doctype, connector, accuracy, percent_missing])

     headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"]
-    _write_to_file(export_dir, "all-docs-cct.tsv", rows, headers)
+    df = pd.DataFrame(rows, columns=headers)
+    export_filename = "all-docs-cct"

-    agg_rows = []
-    agg_rows.append(
-        [
-            "cct-accuracy",
-            _mean(accuracy_scores),
-            _stdev(accuracy_scores),
-            _pstdev(accuracy_scores),
-            len(accuracy_scores),
-        ],
-    )
-    agg_rows.append(
-        [
-            "cct-%missing",
-            _mean(percent_missing_scores),
-            _stdev(percent_missing_scores),
-            _pstdev(percent_missing_scores),
-            len(percent_missing_scores),
-        ],
-    )
-    _write_to_file(export_dir, "aggregate-scores-cct.tsv", agg_rows, agg_headers)
-    _display(agg_rows, agg_headers)
+    acc = df[["cct-accuracy"]].agg([_mean, _stdev, _pstdev, "count"]).transpose()
+    miss = df[["cct-%missing"]].agg([_mean, _stdev, _pstdev, "count"]).transpose()
+    agg_df = pd.concat((acc, miss)).reset_index()
+    agg_df.columns = agg_headers
+
+    if grouping:
+        if grouping in ["doctype", "connector"]:
+            grouped_acc = (
+                df.groupby(grouping)
+                .agg({"cct-accuracy": [_mean, _stdev, "count"]})
+                .rename(columns={"_mean": "mean", "_stdev": "stdev"})
+            )
+            grouped_miss = (
+                df.groupby(grouping)
+                .agg({"cct-%missing": [_mean, _stdev, "count"]})
+                .rename(columns={"_mean": "mean", "_stdev": "stdev"})
+            )
+            df = _format_grouping_output(grouped_acc, grouped_miss)
+            export_filename = f"all-{grouping}-agg-cct"
+        else:
print("No field to group by. Returning a non-group evaluation.") + + _write_to_file(export_dir, f"{export_filename}.tsv", df) + _write_to_file(export_dir, "aggregate-scores-cct.tsv", agg_df) + _display(agg_df) def measure_element_type_accuracy( @@ -124,40 +132,31 @@ def measure_element_type_accuracy( if not source_list: source_list = _listdir_recursive(source_dir) - if not output_list: - print("No output files to calculate to element type for, exiting") - sys.exit(0) - rows = [] - accuracy_scores: List[float] = [] for doc in output_list: # type: ignore - fn = (doc.split("/")[-1]).split(".json")[0] - doctype = fn.rsplit(".", 1)[-1] - fn_json = fn + ".json" + filename = (doc.split("/")[-1]).split(".json")[0] + doctype = filename.rsplit(".", 1)[-1] + fn_json = filename + ".json" connector = doc.split("/")[0] if fn_json in source_list: # type: ignore output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc))) source = get_element_type_frequency(_read_text(os.path.join(source_dir, fn_json))) accuracy = round(calculate_element_type_percent_match(output, source), 3) - rows.append([fn, doctype, connector, accuracy]) - accuracy_scores.append(accuracy) + rows.append([filename, doctype, connector, accuracy]) headers = ["filename", "doctype", "connector", "element-type-accuracy"] - _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", rows, headers) + df = pd.DataFrame(rows, columns=headers) + if df.empty: + agg_df = pd.DataFrame(["element-type-accuracy", None, None, None, 0]).transpose() + else: + agg_df = df.agg({"element-type-accuracy": [_mean, _stdev, _pstdev, "count"]}).transpose() + agg_df = agg_df.reset_index() + agg_df.columns = agg_headers - agg_rows = [] - agg_rows.append( - [ - "element-type-accuracy", - _mean(accuracy_scores), - _stdev(accuracy_scores), - _pstdev(accuracy_scores), - len(accuracy_scores), - ], - ) - _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_rows, agg_headers) - _display(agg_rows, agg_headers) + _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", df) + _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_df) + _display(agg_df) def _listdir_recursive(dir: str): @@ -173,13 +172,20 @@ def _listdir_recursive(dir: str): return listdir -def _display(rows, headers): +def _format_grouping_output(*df): + return pd.concat(df, axis=1).reset_index() + + +def _display(df): + if len(df) == 0: + return + headers = df.columns.tolist() col_widths = [ - max(len(headers[i]), max(len(str(row[i])) for row in rows)) for i in range(len(headers)) + max(len(header), max(len(str(item)) for item in df[header])) for header in headers ] - click.echo(" ".join(headers[i].ljust(col_widths[i]) for i in range(len(headers)))) + click.echo(" ".join(header.ljust(col_widths[i]) for i, header in enumerate(headers))) click.echo("-" * sum(col_widths) + "-" * (len(headers) - 1)) - for row in rows: + for _, row in df.iterrows(): formatted_row = [] for item in row: if isinstance(item, float): @@ -191,31 +197,31 @@ def _display(rows, headers): ) -def _write_to_file(dir: str, filename: str, rows: List[Any], headers: List[Any], mode: str = "w"): +def _write_to_file(dir: str, filename: str, df: pd.DataFrame, mode: str = "w"): if mode not in ["w", "a"]: raise ValueError("Mode not supported. 
@@ -191,31 +197,31 @@
         )


-def _write_to_file(dir: str, filename: str, rows: List[Any], headers: List[Any], mode: str = "w"):
+def _write_to_file(dir: str, filename: str, df: pd.DataFrame, mode: str = "w"):
     if mode not in ["w", "a"]:
         raise ValueError("Mode not supported. Mode must be one of [w, a].")
     if dir and not os.path.exists(dir):
         os.makedirs(dir)
-    with open(os.path.join(os.path.join(dir, filename)), mode, newline="") as tsv:
-        writer = csv.writer(tsv, delimiter="\t")
-        if mode == "w":
-            writer.writerow(headers)
-        writer.writerows(rows)
+    if "count" in df.columns:
+        df["count"] = df["count"].astype(int)
+    if "filename" in df.columns and "connector" in df.columns:
+        df.sort_values(by=["connector", "filename"], inplace=True)
+    df.to_csv(os.path.join(dir, filename), sep="\t", mode=mode, index=False, header=(mode == "w"))


-def _mean(scores: List[float], rounding: Optional[int] = 3):
-    if len(scores) < 1:
+def _mean(scores: Union[pd.Series, List[float]], rounding: Optional[int] = 3):
+    if len(scores) == 0:
         return None
-    elif len(scores) == 1:
-        mean = scores[0]
-    else:
-        mean = statistics.mean(scores)
+    mean = statistics.mean(scores)
     if not rounding:
         return mean
     return round(mean, rounding)


-def _stdev(scores: List[float], rounding: Optional[int] = 3):
+def _stdev(scores: List[Optional[float]], rounding: Optional[int] = 3):
+    # Filter out None values
+    scores = [score for score in scores if score is not None]
+    # Proceed only if there is more than one value
     if len(scores) <= 1:
         return None
     if not rounding:
@@ -223,7 +229,8 @@ def _stdev(scores: List[float], rounding: Optional[int] = 3):
     return round(statistics.stdev(scores), rounding)


-def _pstdev(scores: List[float], rounding: Optional[int] = 3):
+def _pstdev(scores: List[Optional[float]], rounding: Optional[int] = 3):
+    scores = [score for score in scores if score is not None]
     if len(scores) <= 1:
         return None
     if not rounding:
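The None-filtering added to _stdev and _pstdev above keeps empty or single-document evaluations from raising; a short usage sketch of these private helpers (importable from unstructured.metrics.evaluate as of this diff):

```python
from unstructured.metrics.evaluate import _mean, _pstdev, _stdev

print(_mean([0.9, 0.7]))         # 0.8
print(_stdev([0.9, None, 0.7]))  # 0.141 -- None scores are dropped before computing
print(_stdev([0.9]))             # None  -- needs at least two scores
print(_pstdev([0.9, 0.7]))       # 0.1
```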