Print Page | Close Window

Extracting Text By CSV Coordinates

Printed From: Debenu Quick PDF Library - PDF SDK Community Forum
Category: For Users of the Library
Forum Name: I need help - I can help
Forum Description: Problems and solutions while programming with the Debenu Quick PDF Library and Debenu PDF Viewer SDK
URL: http://www.quickpdf.org/forum/forum_posts.asp?TID=3805
Printed Date: 27 Apr 24 at 9:51PM
Software Version: Web Wiz Forums 11.01 - http://www.webwizforums.com


Topic: Extracting Text By CSV Coordinates
Posted By: andreweberle
Subject: Extracting Text By CSV Coordinates
Date Posted: 21 Apr 20 at 3:16AM
Hello,
I'm trying to extract text by the coordinates using C#
I first get the coordinates like so

// Extract the page text and parse the resulting CSV into TextExtraction objects.
// ToCSV is presumably the poster's own extension method (not shown in this post).
List<TextExtraction> textExtractions = quickPdf.ExtractFilePageText(pdfPath, null, 1, 3).ToCSV('\n').ToList();

This converts the CSV data into a generic list.
The list object looks like this:

    // One text fragment parsed from a line of the extracted CSV data.
    public class TextExtraction
    {
        // Font size of the fragment.
        public double FontSize { get; set; }
        // Font name of the fragment.
        public string FontName { get; set; }
        // Font colour of the fragment (the sample data shows values such as #000000).
        public string FontColour { get; set; }
        // The extracted text itself.
        public string Text { get; set; }
        // Corner coordinates belonging to this fragment.
        public List<TextPoint> Points { get; set; }
    }

    // The four corner points of a text fragment's bounding box, in the order they
    // appear in the CSV line (x1,y1 ... x4,y4).
    // NOTE(review): judging from the sample rows in this post, the corners appear to run
    // bottom-left, bottom-right, top-right, top-left -- confirm against the library reference.
    public class TextPoint
    {
        public Vector X1Y1 { get; set; }
        public Vector X2Y2 { get; set; }
        public Vector X3Y3 { get; set; }
        public Vector X4Y4 { get; set; }
    }

I am then trying to set the text extraction area with SetTextExtractionArea (https://www.debenu.com/docs/pdf_library_reference/SetTextExtractionArea.php),
however I am having trouble getting the correct coordinates so it only gets that particular text.

// NOTE(review): the working sample later in this thread passes (left, top, width, height)
// to SetTextExtractionArea. The four values read here look like raw corner coordinates:
// judging from the sample CSV rows below, X4Y4.X is the left edge, X2Y2.Y the lower Y,
// X2Y2.X the right edge and X3Y3.Y the upper Y -- so "width" and "height" hold absolute
// positions rather than sizes. Presumably this is why the wrong region is extracted;
// confirm against the library reference.
var bottomLeft = textExtractions[2].Points[0].X4Y4.X;
var topRight = textExtractions[2].Points[0].X2Y2.Y;
var width = textExtractions[2].Points[0].X2Y2.X;
var height = textExtractions[2].Points[0].X3Y3.Y;

quickPdf.SetTextExtractionArea(bottomLeft, topRight, width, height);
// Re-read the page text, now restricted to the area set above.
string getTextByCoordinates = quickPdf.GetPageText(3);

Here is an example of the data. ( I have removed most of the lines to save space)
"CIDFont+F1",#000000,11.04,481.44,735.4012,483.9351,735.4012,483.9351,748.8706,481.44,748.8706," "
"CIDFont+F1",#000000,11.04,36,724.0012,38.4951,724.0012,38.4951,737.4706,36,737.4706," "
"CIDFont+F1",#000000,11.04,36,709.4812,95.8794,709.4812,95.8794,722.9506,36,722.9506,"TAX INVOICE "
"CIDFont+F1",#000000,11.04,107.9576,709.4812,110.4528,709.4812,110.4528,722.9506,107.9576,722.9506," "
"CIDFont+F1",#000000,11.04,143.9495,709.4812,146.4447,709.4812,146.4447,722.9506,143.9495,722.9506," "
"CIDFont+F1",#000000,11.04,180.0518,709.4812,182.5469,709.4812,182.5469,722.9506,180.0518,722.9506," "
"CIDFont+F1",#000000,11.04,216.0436,709.4812,229.6547,709.4812,229.6547,722.9506,216.0436,722.9506,"25 "




Replies:
Posted By: Ingo
Date Posted: 21 Apr 20 at 7:42AM
Hi Andrew,

Before extraction you should set the origin you want to use:
SetOrigin
https://www.debenu.com/docs/pdf_library_reference/SetOrigin.php
Setting your preferred measurement units is a good idea as well:
SetMeasurementUnits
https://www.debenu.com/docs/pdf_library_reference/SetMeasurementUnits.php

Because a document can contain rotated text content or other peculiarities, you should also use:
CombineContentStreams
https://www.debenu.com/docs/pdf_library_reference/CombineContentStreams.php
NormalizePage
https://www.debenu.com/docs/pdf_library_reference/NormalizePage.php

Now start your extraction and it will work as expected ;-)


Cheers and welcome here,
Ingo



-------------
Cheers,
Ingo



Posted By: andreweberle
Date Posted: 23 Apr 20 at 12:50AM
Hey Ingo,

Thanks very much for your reply.
I was able to achieve this with your advice.

Here is how I achieved it, for future readers.
Although the extraction area can pick up other lines,
in my testing the word you want is always on the first line; anything else can be disregarded.

    class Program
    {
        public static PDFLibrary QP = new PDFLibrary("DebenuPDFLibraryDLL1016.dll");
        static void Main(string[] args)
        {    
            QP.UnlockKey(LICENCE_KEY);

            if (QP.Unlocked() > 0)
            {
                // Load The File.
                QP.LoadFromFile(pdfPath, null);
                QP.SetOrigin(1);
                QP.SetMeasurementUnits(2);
                QP.CombineContentStreams();
                QP.NormalizePage(2);

                // Get The Text Collection.
                List<TextExtraction> textCollection = QP.GetPageText(3).ToList("\r\n").ToList();


                // Get Rec.
                Rect rec = textCollection[6].Points[0].Rect;

                // X4 Y4 -- X3
                QP.DrawBox(rec.Left, rec.Top, rec.Width, rec.Height, 0);

                // Set The Text Region.
                if (QP.SetTextExtractionArea(rec.Left, rec.Top, rec.Width, rec.Height) > 0)
                {
                    // Attempt To Get The Text From The Selected Region.
                    string text = QP.GetPageText(3);

                    // Save The New File.
                    QP.SaveToFile(newPath);

                    // Print Text.
                    Console.WriteLine(text);
                }

                // Release Library.
                QP.ReleaseLibrary();

                // Open The PDF.
                System.Diagnostics.Process.Start(newPath);
                Console.ReadKey();
            }
        }
    }

    /// <summary>
    /// 
    /// </summary>
    public class TextExtraction
    {
        public string FontName { get; set; }
        public string FontColour { get; set; }
        public string Text { get; set; }
        public List<TextPoint> Points { get; set; }
    }
    
    /// <summary>
    /// 
    /// </summary>
    public class TextPoint
    {
        public double FontSize { get; set; }
        public double TextWidth { get; set; }
        public (double,double) X1Y1 { get; set; }
        public (double, double) X2Y2 { get; set; }
        public (double, double) X3Y3 { get; set; }
        public (double, double) X4Y4 { get; set; }
        public Rect Rect { get; set; }
    }
    
    /// <summary>
    /// 
    /// </summary>
    public static class Extenstions
    {
        static readonly Regex CsvSplit = new Regex("(?:^|,)(\"(?:[^\"]+|\"\")*\"|[^,]*)", RegexOptions.Compiled);

        /// <summary>
        /// 
        /// </summary>
        /// <param name="payload"></param>
        /// <param name="c"></param>
        /// <returns></returns>
        public static IEnumerable<TextExtraction> ToList(this string payload, string c)
        {       
            // Split The Payload String.
            List<string> payloadCollection = payload.Split(new string[] {c}, StringSplitOptions.RemoveEmptyEntries).ToList();
            payloadCollection.Remove(payloadCollection.Last());

            foreach (string str in payloadCollection)
            {
                StringBuilder row = new StringBuilder();

                // Split The String To Make It Easier To
                // Get Each Object.
                foreach (Match match in CsvSplit.Matches(str))
                {
                    row.Append(match.Value.TrimStart(',') + '\t');
                }

                row.Length--;             

                string[] obj = row.ToString().Split('\t');

                //Create The Text Extraction Object.
                TextExtraction textExtraction = new TextExtraction()
                {
                    FontName = obj[0],
                    FontColour = obj[1],
                    Text = obj.Last()
                };

                textExtraction.Text.Replace("\"", "");
                textExtraction.Text.TrimEnd();

                // Create The Point.
                textExtraction.Points = new List<TextPoint>
                {
                    new TextPoint()
                    {
                        FontSize = Convert.ToDouble(obj[2]),
                        X1Y1 = (Convert.ToDouble(obj[3]), Convert.ToDouble(obj[4])),
                        X2Y2 = (Convert.ToDouble(obj[5]), Convert.ToDouble(obj[6])),
                        X3Y3 = (Convert.ToDouble(obj[7]), Convert.ToDouble(obj[8])),
                        X4Y4 = (Convert.ToDouble(obj[9]), Convert.ToDouble(obj[10])),
                        Rect = new Rect(Convert.ToDouble(obj[9]) - 0.1, Convert.ToDouble(obj[10])  + 0.1, Convert.ToDouble(obj[3]), Convert.ToDouble(obj[2]) /72),
                    }
                };

                 // TODO: Get Text Width.
                // Get Text Width.
                textExtraction.Points[0].TextWidth = GetTextWidth(textExtraction);
                

                //Add The Point To The Collection.
                yield return textExtraction;
            }
        }
    }


Posted By: Ingo
Date Posted: 23 Apr 20 at 12:47PM
Hi Andrew,

thanks for your sample - this will help :)
I'll move your sample into the samples section later.
Again... Thanks a lot.



-------------
Cheers,
Ingo




Print Page | Close Window

Forum Software by Web Wiz Forums® version 11.01 - http://www.webwizforums.com
Copyright ©2001-2014 Web Wiz Ltd. - http://www.webwiz.co.uk