<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Can anybody suggest a reliable and accurate way to use Tesseract OCR to turn a scanned PDF into a searchable PDF? in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Can-anybody-suggest-a-reliable-and-accurate-way-to-use/m-p/135247#M31672</link>
    <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;I am doing some research into non-commercial solutions for taking a scanned PDF document and using Tesseract OCR to create a searchable PDF document. I am new to the Hortonworks world and community, and any suggestions would be much appreciated.&lt;/P&gt;&lt;P&gt;
Thanks!&lt;/P&gt;</description>
    <pubDate>Mon, 13 Jun 2016 17:34:44 GMT</pubDate>
    <dc:creator>christopher_fra</dc:creator>
    <dc:date>2016-06-13T17:34:44Z</dc:date>
    <item>
      <title>Can anybody suggest a reliable and accurate way to use Tesseract OCR to turn a scanned PDF into a searchable PDF?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Can-anybody-suggest-a-reliable-and-accurate-way-to-use/m-p/135247#M31672</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;I am doing some research into non-commercial solutions for taking a scanned PDF document and using Tesseract OCR to create a searchable PDF document. I am new to the Hortonworks world and community, and any suggestions would be much appreciated.&lt;/P&gt;&lt;P&gt;
Thanks!&lt;/P&gt;</description>
      <pubDate>Mon, 13 Jun 2016 17:34:44 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Can-anybody-suggest-a-reliable-and-accurate-way-to-use/m-p/135247#M31672</guid>
      <dc:creator>christopher_fra</dc:creator>
      <dc:date>2016-06-13T17:34:44Z</dc:date>
    </item>
    <item>
      <title>Re: Can anybody suggest a reliable and accurate way to use Tesseract OCR to turn a scanned PDF into a searchable PDF?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Can-anybody-suggest-a-reliable-and-accurate-way-to-use/m-p/135248#M31673</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/11095/christopher-frankland.html" nodeid="11095"&gt;@Christopher Frankland&lt;/A&gt; here are some resources I would recommend checking out:&lt;/P&gt;&lt;P&gt;&lt;A href="https://community.hortonworks.com/articles/136/how-to-search-for-text-in-an-image.html" target="_blank"&gt;https://community.hortonworks.com/articles/136/how-to-search-for-text-in-an-image.html&lt;/A&gt; by &lt;A rel="user" href="https://community.cloudera.com/users/34/ssen.html" nodeid="34"&gt;@Saptak Sen&lt;/A&gt;&lt;/P&gt;&lt;P&gt;Another fun one using Apache NiFi by &lt;A rel="user" href="https://community.cloudera.com/users/166/jdyer.html" nodeid="166"&gt;@Jeremy Dyer&lt;/A&gt; &lt;/P&gt;&lt;P&gt;&lt;A href="https://community.hortonworks.com/articles/28380/nifi-ocr-using-apache-nifi-to-read-childrens-books.html" target="_blank"&gt;https://community.hortonworks.com/articles/28380/nifi-ocr-using-apache-nifi-to-read-childrens-books.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;And here's a great tutorial with its use:&lt;/P&gt;&lt;P&gt;&lt;A href="http://hortonworks.com/hadoop-tutorial/indexing-and-searching-text-within-images-with-apache-solr/" target="_blank"&gt;http://hortonworks.com/hadoop-tutorial/indexing-and-searching-text-within-images-with-apache-solr/&lt;/A&gt;&lt;/P&gt;&lt;P&gt;Here's also an older blog post, so please check the commands, but it has some important lessons when it comes to accuracy: the quality and resolution of the PDF will greatly affect your results.&lt;/P&gt;&lt;P&gt;&lt;A href="http://kiirani.com/2013/03/22/tesseract-pdf.html" target="_blank"&gt;http://kiirani.com/2013/03/22/tesseract-pdf.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;Posting this as I think it's interesting as well to examine other effects on the source data that might affect accuracy:&lt;/P&gt;&lt;P&gt;&lt;A href="http://www.assistivetechnology.vcu.edu/wp-content/uploads/sites/1864/2013/09/pxc3882784.pdf" 
target="_blank"&gt;http://www.assistivetechnology.vcu.edu/wp-content/uploads/sites/1864/2013/09/pxc3882784.pdf&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 13 Jun 2016 17:54:37 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Can-anybody-suggest-a-reliable-and-accurate-way-to-use/m-p/135248#M31673</guid>
      <dc:creator>henrysowell</dc:creator>
      <dc:date>2016-06-13T17:54:37Z</dc:date>
    </item>
  </channel>
</rss>

