From e65a44eabbcb1597e7a4b46e4d61f083503632f8 Mon Sep 17 00:00:00 2001 From: Klaijan Date: Fri, 5 Jan 2024 18:34:53 -0500 Subject: [PATCH] feat: update cct eval for text dir (#2299) This change updates the `measure_text_extraction_accuracy` function so that it accepts a directory of txt files as well as a directory of json files. The function also takes an `output_type` argument, which must be either "json" or "txt", and checks that the files in the given directory/list are all of the specified file type. To test this feature, run the following command: ```PYTHONPATH=. python unstructured/ingest/evaluate.py measure-text-extraction-accuracy-command --output_dir OUTPUT_DIR --source_dir SOURCE_DIR --output_type txt``` --- CHANGELOG.md | 2 +- .../Bank Good Credit Loan.pptx.txt | 25 + .../IRS-form-1987.pdf.txt | 82 ++ .../Performance-Audit-Discussion.pdf.txt | 55 ++ .../unstructured_output_cct/currency.csv.txt | 823 ++++++++++++++++++ test_unstructured/metrics/test_evaluate.py | 29 + test_unstructured/metrics/test_utils.py | 35 + unstructured/ingest/evaluate.py | 18 +- unstructured/metrics/evaluate.py | 137 +-- unstructured/metrics/utils.py | 222 +++++ 10 files changed, 1327 insertions(+), 101 deletions(-) create mode 100644 example-docs/test_evaluate_files/unstructured_output_cct/Bank Good Credit Loan.pptx.txt create mode 100644 example-docs/test_evaluate_files/unstructured_output_cct/IRS-form-1987.pdf.txt create mode 100644 example-docs/test_evaluate_files/unstructured_output_cct/Performance-Audit-Discussion.pdf.txt create mode 100644 example-docs/test_evaluate_files/unstructured_output_cct/currency.csv.txt create mode 100644 test_unstructured/metrics/test_utils.py create mode 100644 unstructured/metrics/utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4bbe18e54..05d570e6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ### Enhancements -* **Rename kwargs related to extracting image blocks.** Rename the kwargs related to extracting image blocks for consistency and API usage. 
+* **Rename kwargs related to extracting image blocks** Rename the kwargs related to extracting image blocks for consistency and API usage. ### Features diff --git a/example-docs/test_evaluate_files/unstructured_output_cct/Bank Good Credit Loan.pptx.txt b/example-docs/test_evaluate_files/unstructured_output_cct/Bank Good Credit Loan.pptx.txt new file mode 100644 index 000000000..98f5f4439 --- /dev/null +++ b/example-docs/test_evaluate_files/unstructured_output_cct/Bank Good Credit Loan.pptx.txt @@ -0,0 +1,25 @@ +Bank Good Credit +Accredited with IABAC™ +( International Association of Business Analytics Certifications)` +© DataMites™. All Rights Reserved | www.datamites.com +Objective & Background +Classify credit card customers as good / bad, based on information from internal and external sources. +Data provided +Demographic: Base file of with credit card history details. Only one record for every customer. +Account: Contians data for various loans availed by the customer. Not related to credit card. Multiple records for every customer. +Enquiries: Enquired made by customers for different loan purposes. Multiple records for every customer. +© DataMites™. All Rights Reserved | www.datamites.com +Design +Data to be downloaded using SQL queries. +Required information to be extracted from Account and Enquiry files and converted to one-to-one files. +The columns from the two files should be merged with Demographic file using Left Join with “customer no” as key column, to create a final file. The final file should contain all the records in demographic and additional columns/features from Account and Enquiry files will get added to Demographic file. +There will be many customers in account and enquiry file who will get left out. This is fine as we anyway don’t know their good/bad label for training purpose. +© DataMites™. All Rights Reserved | www.datamites.com +Analysis of Data +Show using Excel File +© DataMites™. 
All Rights Reserved | www.datamites.com +Explain Coding / outcomes +Show using Jupyter +© DataMites™. All Rights Reserved | www.datamites.com +Thank You +© DataMites™. All Rights Reserved | www.datamites.com \ No newline at end of file diff --git a/example-docs/test_evaluate_files/unstructured_output_cct/IRS-form-1987.pdf.txt b/example-docs/test_evaluate_files/unstructured_output_cct/IRS-form-1987.pdf.txt new file mode 100644 index 000000000..5c3ba96b2 --- /dev/null +++ b/example-docs/test_evaluate_files/unstructured_output_cct/IRS-form-1987.pdf.txt @@ -0,0 +1,82 @@ +a +Department of the Treasury Internal Revenue Service +Instructions for Form 3115 +(Rev. November 1987) +Application for Change in Accounting Method +(Section references are to the Internal Revenue Code unless otherwise noted.) +Paperwork Reduction Act Notice +We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws and to allow us to figure and collect the right amount of tax. You are required to give us this information. +General Instructions +Purpose of Form +File this form to request a change in your accounting method, including the accounting treatment of any item. If you are requesting a change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods. +When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current revision date of Form 3115). +Generally, applicants must complete Section A. In addition, complete the appropriate sections (B-1 through H) for which a change Is desired. +You must give all relevant facts, including a detailed description of your present and proposed methods. You must also state the reason(s) you believe approval to make the requested change should be granted. 
Attach additional pages if more space is needed for explanations. Each page should show your name, address, and identifying number. +State whether you desire a conference in the National Office if the Service proposes to disapprove your application. +Changes to Accounting Methods Required Under the Tax Reform Act of 1986 +Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under section,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (“Act”), the change is treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustrnents into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required. +Disregard the instructions under Time and Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3115 (e.g., “Automatic Change to Accrual Method—Section 448”). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information. +Long-term contracts. —If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed. 
+Other methods.—Unless the Service has published a regulation or procedure to the contrary, all other changes !n accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of income attributable to the sale or furnishing of utility services no later than the year In which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes. +Time and Place for Filing +Generally, applicants must file this form within the first 180 days of the tax year in which it is desired to make the change. +Taxpayers, other than exempt organizations, should file Form 3115 with the Commissioner of Internal Revenue, Attention: CC:C:4, 1111 Constitution Avenue, NW, Washington, DC 20224, Exempt organizations should file with the Assistant Commissioner (Employee Plans and Exempt Organizations), 1111 Constitution Avenue, NW, Washington, DC 20224. +You should normally receive an acknowledgment of receipt of your application within 30 days. If you do not hear from IRS within 30 days of submitting your completed Form 3115, you may inquire as to the receipt of your application by writing to: Control Clerk, CC:C:4, Internal Revenue Service, Room 5040, 1111 Constitution Avenue, NW, Washington, DC 20224. +See section 5.03 of Rev. Proc. 84-74 for filing an early application. Note: /f this form is being filed in accordance with Rev. Proc. 74-11, see Section G below. +Late Applications +If your application is filed after the 180-day period, it 1s late. 
The application will be considered for processing only upon a showing of “good cause” and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev, Proc. 79-63. +Identifying Number +Individuals.—An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both. Others.-—The employer identification number of +an applicant other than an individual should be entered in this block. +Signature +Individuals. —An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign. +Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” +Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation. +Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title. +Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6. 
+If the individual or firm is also authorized to represent the applicant before the IRS, receive a copy of the requested ruling, or perform any other act(s), the power of attorney must reflect such authorization(s). +Affiliated Groups +Taxpayers that are members of an affiliated group filing a consolidated return that seeks to change to the same accounting method for more than one member of the group must file a separate Form 3115 for each such member, +Specific Instructions Section A +Item 5a, page 1.—“Taxable income or (loss) from operations” is to be entered before application of any net operating loss deduction under section 172(a). +Item 6, page 2.—The term “gross receipts” includes total sales (net of returns and allowances) and all amounts received for services. In addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you are a resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. Gross receipts do not include amounts received for sales taxes if, under the applicable state or local law, the tax is legally imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority. +Item 7b, page 2.—If item 7b 1s “Yes,” indicate on a separate sheet the following for each separate trade or business: Nature of business +(manufacturing, retailer, wholesaler, etc.), employer identification number, overall method of accounting, and whether, in the last 6 years, that business has changed its accounting method, or is also changing its accounting method as part of this request or as a separate request. 
+Item 11, page 2.—If you cannot provide the requested information, you may sign a statement under penalties of perjury that: +(1) Gives your best estimate of the percentage of the section 481(a) adjustment that would have been required if the requested change had been made for each of the 3 preceding years; and +(2) Explains in detail why you cannot provide the requested information. +See section 5.06(2) of Rev. Proc. 84-74 for the required perjury statement that must be attached. +If IRS later examines your return for the year of the change or for later years, it has the right to verify your statement at that time. +Item 13, page 2.—Insert the actual number of tax years. Use of the term “since inception” is not acceptable. However, “more than 6 years” Is acceptable. +Section B-1 +Item 1b, page 2.—Include any amounts reported as income ina prior year although the income had not been accrued (earned) or received in the prior year; for example, discount on installment loans reported as income for the year in which the loans were made instead of for the year or years in which the income was received or earned. Advance payments under Rev. Proc. 71-21 or Regulations section 1.451-5 must be fully explained and all pertinent information must be submitted with this application. +Sections B-2 and B-3 +Limitation on the Use of the Cash Method of Accounting. —Except as provided below, C corporations, partnerships with a C corporation as a partner, and tax shelters may not use the cash method of accounting. For purposes of this limitation, a trust subject to the tax on unrelated business income under section 511 1s treated as aC corporation with respect to its unrelated trade or business activities. 
+The limitation on the use of the cash method (except for tax shelters) does not apply to— +(1) Farming businesses.—F or this purpose, the term “farming business” 1s defined in section 263A(e)(4), but it also includes the raising, harvesting, or growing of trees to which section 263A(c)(5) applies. Notwithstanding this exception, section 447 requires certain C corporations and partnerships with a C corporation as a partner to use the accrual method. +(2) Qualified personal service corporations. — A “qualified personal service corporation” is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law, engineering, architecture, accounting, actuarial science, performing arts, or consulting, and (b) +Page 2 +substantially all of the stock of which is owned by employees performing the services, retired employees who had performed the services, any estate of any individual who had performed the services listed above, or any person who acquired stock of the corporation as a result of the death of an employee or retiree described above if the acquisition occurred within 2 years of death. +(3) Entities with gross receipts of $5,000,000 or less.—To qualify for this exception, the C corporation's or partnership’s annual average gross receipts for the three years ending with the prior tax year may not exceed $5,000,000. If the corporation or partnership was not in existence for the entire 3-year period, the period of existence is used to determine whether the corporation or partnership qualifies. If any tax year in the 3-year period is a short tax year, the corporation or partnership must annualize the gross receipts by multiplying the gross receipts by 12 and dividing the result by the number of months in the short period. +For more information, see section 448 and Temporary Regulations section 1.448-1T. 
+Section C +Applicants must give complete details about the present method of valuing inventory and the proposed method. State whether all or part of your inventory ts involved in the change. +Inventories of retail merchants.—The retail method of pricing inventories does not contemplate valuation of goods at the retail selling price. The retail selling price of goods on hand must be reduced to approximate cost or cost or market, whichever Is lower, by the adjustments required in Regulations section 1.471-8. +LIFO inventory changes.—Attach a schedule with all the required computations when changing the method of figuring LIFO inventories. If you are changing from LIFO to a non-LIFO method, attach a schedule with the following additional information: +(1) The specific types and classes of goods in the LIFO inventories involved in the proposed changes and the comparative value of such Inventories as of the end of the tax year preceding the year of change determined by: (a) the LIFO method, and (b) the proposed method and basis (such as FIFO cost or lower of cost or market). +(2) State whether the proposed identification and valuation methods conform to the inventory method currently used with respect to non-LIFO Inventories, if any, or how such method is otherwise consistent with Regulations section 1.4726. +(3) The termination event statement required by section 5.10 of Rev. Proc. 84-74 and an explanation if there has been a termination event. +Section D +Applicants requesting to change their method of valuing property produced, property acquired for resale, or long-term contracts under section 263A or 460 MUST complete section D showing the treatment under both the present and proposed methods. +% U.S. 
Government Printing Office: 1987—201-993/60166 +Section E +Section 460(f) provides that the term “long-term contract” means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete. +All long-term contracts entered into after February 28, 1986, except for real property construction contracts expected to be completed within 2 years by contractors whose average annual gross receipts for the 3 prior tax years do not exceed $10,000,000, must be accounted for using either the percentage of completion- capitalized cost method or the percentage of completion method. See section 460. +Caution: At the time these instructions were printed, Congress was considering legislation that would repeal the use of the percentage of completion-capitalized cost method for certain long-term contracts. +Section G +This section Is to be used only to request a change in a method of accounting for depreciation under section 167. +Rev. Proc. 74-11 provides a procedure whereby applicants are considered to have obtained the consent of the Commissioner to change their method of accounting for depreciation. You must file Form 3115 with the Service Center where your return will be filed within the first 180 days of the tax year in which it is desired to make the change. Attach a copy of the form to the income tax return for the tax year of the change. +Note: Do not use Form 3115 to make an election under section 168. Such an election may be made only on the tax return for the year in which the property 1s placed in service. 
In addition, Form 3115 is not to be used to request approval to revoke an election made under section 168. Such a request must be made in accordance with Rev. Proc. 87-1 (updated annually). +Section H +Generally, this section should be used for requesting changes In a method of accounting for which provision has not been made elsewhere on this form. Attach additional pages if more space ts needed for a full explanation of the present method used and the proposed change requested. +lf you are making an election under section 458, show the applicable information under Regulations section 1.458-10. \ No newline at end of file diff --git a/example-docs/test_evaluate_files/unstructured_output_cct/Performance-Audit-Discussion.pdf.txt b/example-docs/test_evaluate_files/unstructured_output_cct/Performance-Audit-Discussion.pdf.txt new file mode 100644 index 000000000..078aba2bf --- /dev/null +++ b/example-docs/test_evaluate_files/unstructured_output_cct/Performance-Audit-Discussion.pdf.txt @@ -0,0 +1,55 @@ +GAGAS Performance Audits: Discussion of Concepts to Consider When Auditing Public Functions and Services +The introductory chapter of Government Auditing Standards (GAGAS)1 outlines five concepts describing how public officials are to provide functions and services: effectively, efficiently, economically, ethically, and equitably. When planning, gathering and assessing evidence, and reporting audit results, auditors may focus on one or more of these concepts. The following discussion is intended to assist auditors when developing audit objectives for performance audits of government programs and activities.2 +This discussion is designed to help auditors understand and apply the concepts cited above for performance audits conducted in accordance with GAGAS. This discussion does not contain requirements, does not amend GAGAS, and is not considered interpretive guidance, as defined in chapter 2 of GAGAS. 
+GAGAS Paragraphs +Paragraph 1.02: +The concept of accountability for use of public resources and government authority is key to our nation’s governing processes. Management and officials entrusted with public resources are responsible for carrying out public functions and providing service to the public effectively, efficiently, economically, ethically, and equitably within the context of the statutory boundaries of the specific government program. [Emphasis added.] +Paragraph 1.03: +As reflected in applicable laws, regulations, agreements, and standards, management and officials of government programs are responsible for providing reliable, useful, and timely information for transparency and accountability of these programs and their operations. Legislators, oversight +1GAO, Government Auditing Standards: 2018 Revision, GAO-21-368G (Washington, D.C.: April 2021) +2The concepts cited may also be applicable to other GAGAS engagements, based on the auditors’ judgments. This discussion is limited to considering these concepts in performance audits. +Page 1 +Discussion +Effective +bodies, those charged with governance, and the public need to know whether (1) management and officials manage government resources and use their authority properly and in compliance with laws and regulations; (2) government programs are achieving their objectives and desired outcomes; and (3) government services are provided effectively, efficiently, economically, ethically, and equitably. [Emphasis added.] +Government administration best serves the collective interest of the public when it is effective, efficient, economical, ethical, and equitable. Auditors help inform legislators, oversight bodies, those charged with governance, and the public about whether public services are being provided consistent with these concepts. 
Government auditing can contribute to accountability and can help improve government administration by identifying deficiencies and recommending enhancements to achieve effective, efficient, economical, ethical, and equitable outcomes, when appropriate within the context of the audit objectives. As such, it is important for auditors to understand the concepts below as they relate to administering government programs or activities and how they can assess or address these expectations of government performance in conducting their performance audits. +The examples that follow the discussion of each concept illustrate the distinctions between these concepts. In a performance audit, it is common practice to incorporate more than one of these concepts when conducting the audit. +The administration of a government program or activity is effective when it achieves the intended results. A performance audit that focuses on the effectiveness of a program or activity seeks to establish a cause-and- effect relationship between the operation of the program or activity and achieving its stated objectives. Achieving the objectives does not guarantee that the program or activity was effective unless the auditors can establish that the program or activity caused, or contributed to, the desired outcome. +Example: In a performance audit examining how effective a housing voucher program was in achieving its goal of improving economic outcomes for recipients, auditors may determine whether receiving housing vouchers led to better subsequent economic outcomes for recipients than those of similarly situated individuals who did not receive vouchers. +Page 2 +Efficient +Economical +Example: In a performance audit assessing the effectiveness of an after-school program targeted at helping students improve their reading proficiency, auditors may examine the extent to which participants’ reading levels improved relative to baseline data from before they joined the program. 
+The administration of a government program or activity is efficient when it gets the most value from available resources. When a performance audit focuses on efficiency, auditors examine whether the resources used to administer a program or activity have been put to optimal or satisfactory use, or whether the same or similar results could have been achieved more timely or with fewer resources. +Example: In a performance audit assessing a disaster relief agency’s mobilization of resources to respond to a disaster, auditors may assess the disaster relief agency’s timeliness in providing relief compared to its own previous performance or the performance of other similarly situated agencies that have responded to comparable disasters. +Example: In a performance audit assessing a consumer protection agency’s response to consumer complaints, auditors may assess whether the agency’s efforts to streamline its processes resulted in improved timely resolution of complaints. +Example: In a performance audit assessing the time a state needs to process unemployment benefits targeted at helping those in need, auditors may assess how long the process takes from receipt of the unemployment application to the applicant’s receipt of the benefit, including steps such as verifying required information. +The administration of a government program or activity is economical when it minimizes the costs of resources used in performing its functions while meeting timeliness and quality considerations for those resources. When auditing economy, auditors primarily focus on the costs of inputs rather than on the outcomes achieved. +Example: In a performance audit examining an agency’s international travel expenses, in addition to assessing the design of internal controls and compliance with expense guidelines, auditors may test whether, for a sample of trips, bookings of +Page 3 +Ethical +Equitable +equivalent airline tickets and hotel rooms could be found at a lower cost. 
+Example: In a performance audit assessing an agency’s acquisition practices, auditors may examine whether the agency’s decisions regarding purchasing, leasing, or reimbursing employees for the costs of acquiring various supplies or equipment achieved the lowest cost while meeting applicable requirements. +The administration of a government program or activity is ethical when it advances the collective interest of the public rather than private gain and is conducted with honesty, integrity, and impartiality. Laws and regulations often specify rules of ethical conduct. Therefore, audits examining the ethical administration of a program or activity may involve assessing compliance with such laws and regulations. Fraud in administering a government program or activity betrays the public trust and is, by definition, unethical. In addition, auditors may identify instances of unethical conduct that result in waste and abuse during testing of internal controls as part of a performance audit. +Example: In a performance audit assessing agency officials’ compliance with conflict-of-interest requirements, auditors may compare a sample of financial disclosure reports filed against requirements in statute or regulation. +Example: In a performance audit assessing potential regulatory capture related to a particular industry, auditors may assess the extent to which the regulatory agency has sufficient controls to reasonably assure its employees’ independence from the entities subject to the agency’s regulation. +Example: In a performance audit assessing an office’s policies and procedures for purchase cards, auditors’ testing of the program’s controls to identify deficiencies may identify fraud, waste, or abuse in its administration. +The administration of a government program or activity is equitable when it consistently serves members of the public, distributes public services, and implements public policy in a manner that promotes fairness, justice, and equality. 
Auditing whether the administration of a government program or activity is equitable may include assessing the +Page 4 +equality of access to and provision of services; +procedural fairness and equal treatment of individuals in government programs and policies; +causes of disparate outcomes; +or distributional impacts of public policies, programs, resources, and services. +Disaggregating data by social groups or communities that share a particular characteristic (e.g., gender, race, ethnicity, age, or income) can help illuminate differences. Reporting on such differences, when appropriate within the context of the audit objectives, can increase understanding of the effects of policies and programs on issues of equity. +Example: In a performance audit assessing the granting of waivers from particular requirements, auditors may use disaggregated data about waiver recipients to assess whether different groups or communities were treated fairly and equally in the process. +Example: In a performance audit assessing a grant program aimed at expanding internet access, auditors may assess the extent to which formulas, criteria, or other factors (such as matching funds or capital requirements) considered in the distribution of grant funds may be to the specific advantage or disadvantage of certain groups, regions, or communities, thereby causing inequities. +Example: In a performance audit assessing scholarship outcomes in higher education programs, auditors may report on the distribution of scholarships by race, gender identity, and income to illuminate potential disparities among scholarship recipients. +These concepts may overlap. For example, efficiency may also be a component of effectiveness. 
Similarly, when appropriate within the context of the program and audit objectives, auditors may disaggregate the results of performance audits that focus on efficiency or effectiveness +Page 5 +For More Information +issues to illuminate inequities in program administration or in distribution of public services. +While all of these concepts are important to administering government programs responsibly, it is up to the professional judgment of the auditors to determine the specific concepts that are relevant in conducting the performance audit and reporting the results. Auditors’ professional judgments are informed by, among other things, the needs of the users of the audit reports; the nature, context, and objectives of the program or activity under audit; and the public interest. +To view the current Yellow Book, visit https://www.gao.gov/yellowbook. +For technical assistance, call (202) 512-9535 or email yellowbook@gao.gov. +Page 6 \ No newline at end of file diff --git a/example-docs/test_evaluate_files/unstructured_output_cct/currency.csv.txt b/example-docs/test_evaluate_files/unstructured_output_cct/currency.csv.txt new file mode 100644 index 000000000..5b8f3e4e7 --- /dev/null +++ b/example-docs/test_evaluate_files/unstructured_output_cct/currency.csv.txt @@ -0,0 +1,823 @@ + + + +Code +Symbol +Name + + +AED +د.إ +United Arab Emirates d + + +AFN +؋ +Afghan afghani + + +ALL +L +Albanian lek + + +AMD +AMD +Armenian dram + + +ANG +ƒ +Netherlands Antillean gu + + +AOA +Kz +Angolan kwanza + + +ARS +$ +Argentine peso + + +AUD +$ +Australian dollar + + +AWG +Afl. +Aruban florin + + +AZN +AZN +Azerbaijani manat + + +BAM +KM +Bosnia and Herzegovina + + +BBD +$ +Barbadian dollar + + +BDT +৳ +Bangladeshi taka + + +BGN +лв. +Bulgarian lev + + +BHD +.د.ب +Bahraini dinar + + +BIF +Fr +Burundian franc + + +BMD +$ +Bermudian dollar + + +BND +$ +Brunei dollar + + +BOB +Bs. 
+Bolivian boliviano + + +BRL +R$ +Brazilian real + + +BSD +$ +Bahamian dollar + + +BTC +฿ +Bitcoin + + +BTN +Nu. +Bhutanese ngultrum + + +BWP +P +Botswana pula + + +BYR +Br +Belarusian ruble (old)' + + +BYN +Br +Belarusian ruble + + +BZD +$ +Belize dollar + + +CAD +$ +Canadian dollar + + +CDF +Fr +Congolese franc + + +CHF +CHF +Swiss franc + + +CLP +$ +Chilean peso + + +CNY +¥ +Chinese yuan + + +COP +$ +Colombian peso + + +CRC +₡ +Costa Rican colón + + +CUC +$ +Cuban convertible peso') + + +CUP +$ +Cuban peso + + +CVE +$ +Cape Verdean escudo + + +CZK +Kč +Czech koruna + + +DJF +Fr +Djiboutian franc + + +DKK +DKK +Danish krone + + +DOP +RD$ +Dominican peso + + +DZD +د.ج +Algerian dinar + + +EGP +EGP +Egyptian pound + + +ERN +Nfk +Eritrean nakfa + + +ETB +Br +Ethiopian birr + + +EUR +€ +Euro + + +FJD +$ +Fijian dollar + + +FKP +£ +Falkland Islands pound') + + +GBP +£ +Pound sterling + + +GEL +₾ +Georgian lari + + +GGP +£ +Guernsey pound + + +GHS +₵ +Ghana cedi + + +GIP +£ +Gibraltar pound + + +GMD +D +Gambian dalasi + + +GNF +Fr +Guinean franc + + +GTQ +Q +Guatemalan quetzal + + +GYD +$ +Guyanese dollar + + +HKD +$ +Hong Kong dollar + + +HNL +L +Honduran lempira + + +HRK +kn +Croatian kuna + + +HTG +G +Haitian gourde + + +HUF +Ft +Hungarian forint + + +IDR +Rp +Indonesian rupiah + + +ILS +₪ +Israeli new shekel + + +IMP +£ +Manx pound + + +INR +₹ +Indian rupee + + +IQD +ع.د +Iraqi dinar + + +IRR +﷼ +Iranian rial + + +IRT +تومان +Iranian toman + + +ISK +kr. 
+Icelandic króna + + +JEP +£ +Jersey pound + + +JMD +$ +Jamaican dollar + + +JOD +د.ا +Jordanian dinar + + +JPY +¥ +Japanese yen + + +KES +KSh +Kenyan shilling + + +KGS +сом +Kyrgyzstani som + + +KHR +៛ +Cambodian riel + + +KMF +Fr +Comorian franc + + +KPW +₩ +North Korean won + + +KRW +₩ +South Korean won + + +KWD +د.ك +Kuwaiti dinar + + +KYD +$ +Cayman Islands dollar + + +KZT +₸ +Kazakhstani tenge + + +LAK +₭ +Lao kip + + +LBP +ل.ل +Lebanese pound + + +LKR +රු +Sri Lankan rupee + + +LRD +$ +Liberian dollar + + +LSL +L +Lesotho loti + + +LYD +ل.د +Libyan dinar + + +MAD +د.م. +Moroccan dirham + + +MDL +MDL +Moldovan leu + + +MGA +Ar +Malagasy ariary + + +MKD +ден +Macedonian denar + + +MMK +Ks +Burmese kyat + + +MNT +₮ +Mongolian tögrög + + +MOP +P +Macanese pataca + + +MRU +UM +Mauritanian ouguiya + + +MUR +₨ +Mauritian rupee + + +MVR +.ރ +Maldivian rufiyaa + + +MWK +MK +Malawian kwacha + + +MXN +$ +Mexican peso + + +MYR +RM +Malaysian ringgit + + +MZN +MT +Mozambican metical + + +NAD +N$ +Namibian dollar + + +NGN +₦ +Nigerian naira + + +NIO +C$ +Nicaraguan córdoba + + +NOK +kr +Norwegian krone + + +NPR +₨ +Nepalese rupee + + +NZD +$ +New Zealand dollar + + +OMR +ر.ع. +Omani rial + + +PAB +B/. +Panamanian balboa + + +PEN +S/ +Sol + + +PGK +K +Papua New Guinean kina') + + +PHP +₱ +Philippine peso + + +PKR +₨ +Pakistani rupee + + +PLN +zł +Polish złoty + + +PRB +р. +Transnistrian ruble + + +PYG +₲ +Paraguayan guaraní + + +QAR +ر.ق +Qatari riyal + + +RON +lei +Romanian leu + + +RSD +рсд +Serbian dinar + + +RUB +₽ +Russian ruble + + +RWF +Fr +Rwandan franc + + +SAR +ر.س +Saudi riyal + + +SBD +$ +Solomon Islands dollar') + + +SCR +₨ +Seychellois rupee + + +SDG +ج.س. 
+Sudanese pound + + +SEK +kr +Swedish krona + + +SGD +$ +Singapore dollar + + +SHP +£ +Saint Helena pound + + +SLL +Le +Sierra Leonean leone + + +SOS +Sh +Somali shilling + + +SRD +$ +Surinamese dollar + + +SSP +£ +South Sudanese pound + + +STN +Db +São Tomé and Príncipe d + + +SYP +ل.س +Syrian pound + + +SZL +L +Swazi lilangeni + + +THB +฿ +Thai baht + + +TJS +ЅМ +Tajikistani somoni + + +TMT +m +Turkmenistan manat + + +TND +د.ت +Tunisian dinar + + +TOP +T$ +Tongan paʻanga + + +TRY +₺ +Turkish lira + + +TTD +$ +Trinidad and Tobago doll + + +TWD +NT$ +New Taiwan dollar + + +TZS +Sh +Tanzanian shilling + + +UAH +₴ +Ukrainian hryvnia + + +UGX +UGX +Ugandan shilling + + +USD +$ +United States (US) dolla + + +UYU +$ +Uruguayan peso + + +UZS +UZS +Uzbekistani som + + +VEF +Bs F +Venezuelan bolívar + + +VES +Bs.S +Bolívar soberano + + +VND +₫ +Vietnamese đồng + + +VUV +Vt +Vanuatu vatu + + +WST +T +Samoan tālā + + +XAF +CFA +Central African CFA fr + + +XCD +$ +East Caribbean dollar + + +XOF +CFA +West African CFA franc + + +XPF +Fr +CFP franc + + +YER +﷼ +Yemeni rial + + +ZAR +R +South African rand + + +ZMW +ZK +Zambian kwacha + + diff --git a/test_unstructured/metrics/test_evaluate.py b/test_unstructured/metrics/test_evaluate.py index 478e052ad..a3306533b 100644 --- a/test_unstructured/metrics/test_evaluate.py +++ b/test_unstructured/metrics/test_evaluate.py @@ -20,6 +20,7 @@ TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files") UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output" GOLD_CCT_DIRNAME = "gold_standard_cct" GOLD_ELEMENT_TYPE_DIRNAME = "gold_standard_element_type" +UNSTRUCTURED_CCT_DIRNAME = "unstructured_output_cct" @pytest.fixture() @@ -51,6 +52,22 @@ def test_text_extraction_evaluation(): assert df.iloc[0].filename == "Bank Good Credit Loan.pptx" +@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") +@pytest.mark.usefixtures("_cleanup_after_test") +def test_text_extraction_evaluation_type_txt(): + 
output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_CCT_DIRNAME) + source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME) + export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct_txt") + measure_text_extraction_accuracy( + output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, output_type="txt" + ) + assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv")) + df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t") + assert len(df) == 3 + assert len(df.columns) == 5 + assert df.iloc[0].filename == "Bank Good Credit Loan.pptx" + + @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") @pytest.mark.usefixtures("_cleanup_after_test") def test_element_type_evaluation(): @@ -81,6 +98,7 @@ def test_text_extraction_takes_list(): export_dir=export_dir, ) # check that only the listed files are included + assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv")) df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t") assert len(df) == len(output_list) @@ -96,3 +114,14 @@ def test_text_extraction_grouping(): ) df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t") assert len(df) == 4 # metrics row and doctype rows + + +@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") +def test_text_extraction_wrong_type(): + output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME) + source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME) + export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct") + with pytest.raises(ValueError): + measure_text_extraction_accuracy( + output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, output_type="wrong" + ) diff --git a/test_unstructured/metrics/test_utils.py b/test_unstructured/metrics/test_utils.py new file mode 100644 index 000000000..4186b4441 --- /dev/null +++ b/test_unstructured/metrics/test_utils.py @@ -0,0 +1,35 @@ 
+import pytest + +from unstructured.metrics.utils import ( + _mean, + _pstdev, + _stdev, + _uniquity_file, +) + + +@pytest.mark.parametrize( + ("numbers", "expected_mean", "expected_stdev", "expected_pstdev"), + [ + ([2, 5, 6, 7], 5, 2.16, 1.871), + ([1, 100], 50.5, 70.004, 49.5), + ([1], 1, None, None), + ([], None, None, None), + ], +) +def test_stats(numbers, expected_mean, expected_stdev, expected_pstdev): + mean = _mean(numbers) + stdev = _stdev(numbers) + pstdev = _pstdev(numbers) + assert mean == expected_mean + assert stdev == expected_stdev + assert pstdev == expected_pstdev + + +@pytest.mark.parametrize( + ("filenames"), + [("filename.ext", "filename (1).ext", "randomfile.ext", "filename.txt", "filename (5).txt")], +) +def test_uniquity_file(filenames): + final_filename = _uniquity_file(filenames, "filename.ext") + assert final_filename == "filename (2).ext" diff --git a/unstructured/ingest/evaluate.py b/unstructured/ingest/evaluate.py index c0a3cfab4..4cbcba159 100755 --- a/unstructured/ingest/evaluate.py +++ b/unstructured/ingest/evaluate.py @@ -55,18 +55,34 @@ def main(): default=False, help="Add the flag to show progress bar.", ) +@click.option( + "--output_type", + type=str, + default="json", + show_default=True, + help="Takes in either `txt` or `json` as output_type.", +) def measure_text_extraction_accuracy_command( output_dir: str, source_dir: str, export_dir: str, weights: Tuple[int, int, int], visualize: bool, + output_type: str, output_list: Optional[List[str]] = None, source_list: Optional[List[str]] = None, grouping: Optional[str] = None, ): return measure_text_extraction_accuracy( - output_dir, source_dir, output_list, source_list, export_dir, grouping, weights, visualize + output_dir, + source_dir, + output_list, + source_list, + export_dir, + grouping, + weights, + visualize, + output_type, ) diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py index 717ae093a..a8e560062 100755 --- 
a/unstructured/metrics/evaluate.py +++ b/unstructured/metrics/evaluate.py @@ -2,11 +2,9 @@ import logging import os -import statistics import sys -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple -import click import pandas as pd from tqdm import tqdm @@ -15,16 +13,26 @@ from unstructured.metrics.element_type import ( get_element_type_frequency, ) from unstructured.metrics.text_extraction import calculate_accuracy, calculate_percent_missing_text -from unstructured.staging.base import elements_from_json, elements_to_text +from unstructured.metrics.utils import ( + _display, + _format_grouping_output, + _listdir_recursive, + _mean, + _prepare_output_cct, + _pstdev, + _read_text_file, + _stdev, + _write_to_file, +) -logger = logging.getLogger("unstructured.ingest") +logger = logging.getLogger("unstructured.eval") handler = logging.StreamHandler() -handler.name = "ingest_log_handler" +handler.name = "eval_log_handler" formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s") handler.setFormatter(formatter) # Only want to add the handler once -if "ingest_log_handler" not in [h.name for h in logger.handlers]: +if "eval_log_handler" not in [h.name for h in logger.handlers]: logger.addHandler(handler) logger.setLevel(logging.DEBUG) @@ -42,6 +50,7 @@ def measure_text_extraction_accuracy( grouping: Optional[str] = None, weights: Tuple[int, int, int] = (2, 1, 1), visualize: bool = False, + output_type: str = "json", ) -> None: """ Loops through the list of structured output from all of `output_dir` or selected files from @@ -57,15 +66,27 @@ def measure_text_extraction_accuracy( source_list = _listdir_recursive(source_dir) if not output_list: - print("No output files to calculate to edit distances for, exiting") + logger.info("No output files to calculate to edit distances for, exiting") sys.exit(0) + if output_type not in ["json", "txt"]: + raise ValueError( + f"Specified file type under `output_dir` 
or `output_list` should be one of \ + `json` or `txt`. The given file type is {output_type}, exiting." + ) + if not all(_.endswith(output_type) for _ in output_list): + logger.warning( + "The directory contains file type inconsistent with the given input. \ + Please note that some files will be skipped." + ) rows = [] + ext_index = -(len(output_type) + 1) # assumption: output file name convention is name-of-file.doc.json # NOTE(klaijan) - disable=True means to not show, disable=False means to show the progress bar for doc in tqdm(output_list, leave=False, disable=not visualize): # type: ignore - filename = (doc.split("/")[-1]).split(".json")[0] + # filename = (doc.split("/")[-1]).split(f".{output_type}")[0] + filename = os.path.basename(doc)[:ext_index] doctype = filename.rsplit(".", 1)[-1] fn_txt = filename + ".txt" connector = doc.split("/")[0] if len(doc.split("/")) > 1 else None @@ -77,11 +98,14 @@ def measure_text_extraction_accuracy( fn_txt = fn + ".txt" if fn_txt in source_list: # type: ignore - output_cct = elements_to_text(elements_from_json(os.path.join(output_dir, doc))) - source_cct = _read_text(os.path.join(source_dir, fn_txt)) + try: + output_cct = _prepare_output_cct(os.path.join(output_dir, doc), output_type) + source_cct = _read_text_file(os.path.join(source_dir, fn_txt)) + except Exception: + # if any of the output/source file is unable to open, skip the loop + continue accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3) percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3) - rows.append([filename, doctype, connector, accuracy, percent_missing]) headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"] @@ -146,8 +170,8 @@ def measure_element_type_accuracy( connector = doc.split("/")[0] if len(doc.split("/")) > 1 else None if fn_json in source_list: # type: ignore - output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc))) - source = 
get_element_type_frequency(_read_text(os.path.join(source_dir, fn_json))) + output = get_element_type_frequency(_read_text_file(os.path.join(output_dir, doc))) + source = get_element_type_frequency(_read_text_file(os.path.join(source_dir, fn_json))) accuracy = round(calculate_element_type_percent_match(output, source), 3) rows.append([filename, doctype, connector, accuracy]) @@ -163,88 +187,3 @@ def measure_element_type_accuracy( _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", df) _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_df) _display(agg_df) - - -def _listdir_recursive(dir: str): - listdir = [] - for dirpath, _, filenames in os.walk(dir): - for filename in filenames: - # Remove the starting directory from the path to show the relative path - relative_path = os.path.relpath(dirpath, dir) - if relative_path == ".": - listdir.append(filename) - else: - listdir.append(f"{relative_path}/{filename}") - return listdir - - -def _format_grouping_output(*df): - return pd.concat(df, axis=1).reset_index() - - -def _display(df): - if len(df) == 0: - return - headers = df.columns.tolist() - col_widths = [ - max(len(header), max(len(str(item)) for item in df[header])) for header in headers - ] - click.echo(" ".join(header.ljust(col_widths[i]) for i, header in enumerate(headers))) - click.echo("-" * sum(col_widths) + "-" * (len(headers) - 1)) - for _, row in df.iterrows(): - formatted_row = [] - for item in row: - if isinstance(item, float): - formatted_row.append(f"{item:.3f}") - else: - formatted_row.append(str(item)) - click.echo( - " ".join(formatted_row[i].ljust(col_widths[i]) for i in range(len(formatted_row))), - ) - - -def _write_to_file(dir: str, filename: str, df: pd.DataFrame, mode: str = "w"): - if mode not in ["w", "a"]: - raise ValueError("Mode not supported. 
Mode must be one of [w, a].") - if dir and not os.path.exists(dir): - os.makedirs(dir) - if "count" in df.columns: - df["count"] = df["count"].astype(int) - if "filename" in df.columns and "connector" in df.columns: - df.sort_values(by=["connector", "filename"], inplace=True) - df.to_csv(os.path.join(dir, filename), sep="\t", mode=mode, index=False, header=(mode == "w")) - - -def _mean(scores: Union[pd.Series, List[float]], rounding: Optional[int] = 3): - if len(scores) == 0: - return None - mean = statistics.mean(scores) - if not rounding: - return mean - return round(mean, rounding) - - -def _stdev(scores: List[Optional[float]], rounding: Optional[int] = 3): - # Filter out None values - scores = [score for score in scores if score is not None] - # Proceed only if there are more than one value - if len(scores) <= 1: - return None - if not rounding: - return statistics.stdev(scores) - return round(statistics.stdev(scores), rounding) - - -def _pstdev(scores: List[Optional[float]], rounding: Optional[int] = 3): - scores = [score for score in scores if score is not None] - if len(scores) <= 1: - return None - if not rounding: - return statistics.pstdev(scores) - return round(statistics.pstdev(scores), rounding) - - -def _read_text(path): - with open(path, errors="ignore") as f: - text = f.read() - return text diff --git a/unstructured/metrics/utils.py b/unstructured/metrics/utils.py new file mode 100644 index 000000000..13c60d19e --- /dev/null +++ b/unstructured/metrics/utils.py @@ -0,0 +1,222 @@ +import logging +import os +import re +import statistics +from typing import List, Optional, Union + +import click +import pandas as pd + +from unstructured.staging.base import elements_from_json, elements_to_text + +logger = logging.getLogger("unstructured.eval") + + +def _prepare_output_cct(docpath: str, output_type: str) -> str: + """ + Convert given input document (path) into cct-ready. The function only supports conversion + from a `json` or `txt` file.
+ """ + try: + if output_type == "json": + output_cct = elements_to_text(elements_from_json(docpath)) + elif output_type == "txt": + output_cct = _read_text_file(docpath) + else: + raise ValueError( + f"File type not supported. Expects one of `json` or `txt`, \ + but received {output_type} instead." + ) + except ValueError as e: + logger.error(f"Could not read the file {docpath}") + raise e + return output_cct + + +def _listdir_recursive(dir: str) -> List[str]: + """ + Recursively lists all files in the given directory and its subdirectories. + Returns a list of all files found, with each file's path relative to the + initial directory. + """ + listdir = [] + for dirpath, _, filenames in os.walk(dir): + for filename in filenames: + # Remove the starting directory from the path to show the relative path + relative_path = os.path.relpath(dirpath, dir) + if relative_path == ".": + listdir.append(filename) + else: + listdir.append(os.path.join(relative_path, filename)) + return listdir + + +def _format_grouping_output(*df): + """ + Concatenates multiple pandas DataFrame objects along the columns (side-by-side) + and resets the index. + """ + return pd.concat(df, axis=1).reset_index() + + +def _display(df): + """ + Displays the evaluation metrics in a formatted text table. 
+ """ + if len(df) == 0: + return + headers = df.columns.tolist() + col_widths = [ + max(len(header), max(len(str(item)) for item in df[header])) for header in headers + ] + click.echo(" ".join(header.ljust(col_widths[i]) for i, header in enumerate(headers))) + click.echo("-" * sum(col_widths) + "-" * (len(headers) - 1)) + for _, row in df.iterrows(): + formatted_row = [] + for item in row: + if isinstance(item, float): + formatted_row.append(f"{item:.3f}") + else: + formatted_row.append(str(item)) + click.echo( + " ".join(formatted_row[i].ljust(col_widths[i]) for i in range(len(formatted_row))), + ) + + +def _write_to_file( + dir: str, filename: str, df: pd.DataFrame, mode: str = "w", overwrite: bool = True +): + """ + Save the metrics report to a tsv file. The function allows an option 1) to choose `mode` + as `w` (write) or `a` (append) and 2) to `overwrite` the file if it already exists or not. + """ + if mode not in ["w", "a"]: + raise ValueError("Mode not supported. Mode must be one of [w, a].") + if dir and not os.path.exists(dir): + os.makedirs(dir) + if "count" in df.columns: + df["count"] = df["count"].astype(int) + if "filename" in df.columns and "connector" in df.columns: + df.sort_values(by=["connector", "filename"], inplace=True) + if not overwrite: + filename = _get_non_duplicated_filename(dir, filename) + df.to_csv(os.path.join(dir, filename), sep="\t", mode=mode, index=False, header=(mode == "w")) + + +def _sorting_key(filename): + """ + A function that defines the sorting method for duplicated file names. For example, + with filename.ext filename (1).ext filename (2).ext filename (10).ext - this function + extracts the integer in the brackets and sorts by those numbers in ascending order.
+ """ + # Regular expression to find the number in the filename + numbers = re.findall(r"(\d+)", filename) + if numbers: + # If there's a number, return it as an integer for sorting + return int(numbers[-1]) + else: + # If no number, return 0 so these files come first + return 0 + + +def _uniquity_file(file_list, target_filename) -> str: + """ + Checks the duplicity of the file name from the list and runs the numerical check + of the minimum number needed as extension to not overwrite the existing file. + Returns a string of file name in the format of `filename (N).ext`. + """ + original_filename, extension = target_filename.rsplit(".", 1) + pattern = rf"^{re.escape(original_filename)}(?: \((\d+)\))?\.{re.escape(extension)}$" + duplicated_files = sorted([f for f in file_list if re.match(pattern, f)], key=_sorting_key) + + numbers = [] + for file in duplicated_files: + match = re.search(r"\((\d+)\)", file) + if match: + numbers.append(int(match.group(1))) + + numbers.sort() + + counter = 1 + for number in numbers: + if number == counter: + counter += 1 + else: + break + + return original_filename + " (" + str(counter) + ")." + extension + + +def _get_non_duplicated_filename(dir, filename) -> str: + """ + Helper function that calls the `_uniquity_file` function. Takes in directory and file name + to check on. + """ + filename = _uniquity_file(os.listdir(dir), filename) + return filename + + +def _mean(scores: Union[pd.Series, List[float]], rounding: Optional[int] = 3) -> Union[float, None]: + """ + Find mean from the list. Returns None if no element in the list. + + Args: + rounding (int): optional argument that allows user to define decimal points. Default at 3. + """ + if len(scores) == 0: + return None + mean = statistics.mean(scores) + if not rounding: + return mean + return round(mean, rounding) + + +def _stdev(scores: List[Optional[float]], rounding: Optional[int] = 3) -> Union[float, None]: + """ + Find standard deviation from the list.
+ Returns None if only 0 or 1 element in the list. + + Args: + rounding (int): optional argument that allows user to define decimal points. Default at 3. + """ + # Filter out None values + scores = [score for score in scores if score is not None] + # Proceed only if there are more than one value + if len(scores) <= 1: + return None + if not rounding: + return statistics.stdev(scores) + return round(statistics.stdev(scores), rounding) + + +def _pstdev(scores: List[Optional[float]], rounding: Optional[int] = 3) -> Union[float, None]: + """ + Find population standard deviation from the list. + Returns None if only 0 or 1 element in the list. + + Args: + rounding (int): optional argument that allows user to define decimal points. Default at 3. + """ + scores = [score for score in scores if score is not None] + if len(scores) <= 1: + return None + if not rounding: + return statistics.pstdev(scores) + return round(statistics.pstdev(scores), rounding) + + +def _read_text_file(path): + """ + Reads the contents of a text file and returns it as a string. + """ + # Check if the file exists + if not os.path.exists(path): + raise FileNotFoundError(f"The file at {path} does not exist.") + + try: + with open(path, errors="ignore") as f: + text = f.read() + return text + except OSError as e: + # Handle other I/O related errors + raise IOError(f"An error occurred when reading the file at {path}: {e}")