
    .ih4                         d Z ddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ ej        Z ej        e          Z G d dej                  Z G d dej                  ZdS )	)PdfTextPagePdfTextSearcher    N)PdfiumError)
PdfTextObjc                        e Zd ZdZ fdZed             ZddZddZdd
Z	d Z
ddZd ZddZd Zd ZddZ xZS )r   a  
    Text page helper class.
    
    Hint:
        (py)pdfium itself does not implement layout analysis, such as detecting words/lines/paragraphs.
        However, there may be third-party extensions for this job, e.g.: https://github.com/VikParuchuri/pdftext
    
    Attributes:
        raw (FPDF_TEXTPAGE):
            The underlying PDFium textpage handle.
        page (PdfPage):
            Reference to the page this textpage belongs to.
    c                 z    || _         || _        t                                          t          j                   d S N)rawpagesuper__init__pdfium_cFPDFText_ClosePage)selfr
   r   	__class__s      W/var/www/html/analyses/venv/lib/python3.11/site-packages/pypdfium2/_helpers/textpage.pyr   zPdfTextPage.__init__    s2    	455555    c                     | j         S r	   )r   r   s    r   parentzPdfTextPage.parent%   s
    yr   Nignorec                    | j                                         }||d         }||d         }||d         }||d         }| ||||f}t          j        g |ddR  }|dk    rdS t	          j        |dz            }	t	          j        |	t	          j        t          j                            }
t          j        g ||
|R   |	j	        
                    d|          S )	a  
        Extract text from given boundaries, in PDF canvas units.
        If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`.
        
        Parameters:
            errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text on the page area in question, or an empty string if no text was found.
        
        Note:
            PDFium outputs CRLF (``\r\n``) style line breaks.
            This may be undesirable or confusing in some situations, e.g. when processing the output with an (unaware) parser on the command line.
            If this is an issue, replace ``\r\n`` with just ``\n``.
        Nr             	utf-16-leerrors)r   get_bboxr   FPDFText_GetBoundedTextctypescreate_string_buffercastPOINTERc_ushortr
   decode)r   leftbottomrighttopr   bboxargsn_charsbuffer
buffer_ptrs              r   get_text_boundedzPdfTextPage.get_text_bounded*   s      y!!##<7D>!WF=GE;q'CdC/2BDB$BBBBa<<2,Wq[99[)H)HII
(D$D
DGDDDDz  V <<<r   r   c                    ||k    rdS t          j        | |          }|dk    r|                     |dz   ||dz   |          S t          j        | |          }|dk    r|                     ||dz
  ||dz             S ||||fS )Nr   r   )r   "FPDFText_GetTextIndexFromCharIndex_get_active_text_range)r   c_startc_end	l_passive	r_passivet_startt_ends          r   r5   z"PdfTextPage._get_active_text_rangeP   s    U??1=dGLLb==..wqy%1iXXX;D%HHB;;..waIVWKXXXy)33r   r3   c                    |dk    r|                                  |z
  }|                     |||z   dz
            }|dk    rdS |\  }}}}||z  }|||z   z  }|dz   |z
  }	t          j        |	dz            }
t          j        |
t          j        t          j                            }t          j        | |||          }|	|k    sJ d|	 d|             |
j	        d|dz
  dz           
                    d	|
          S )a  
        Extract text from a given range.
        
        Parameters:
            index (int): Index of the first char to include.
            count (int): Number of chars to cover, relative to the internal char list. Defaults to -1 for all remaining chars after *index*.
            errors (str): Error handling when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text in the range in question, or an empty string if no text was found.
        
        Warning:
            This method is limited to UCS-2, whereas :meth:`.get_text_bounded` provides full Unicode support.
        
        Note:
            * Like :meth:`.get_text_bounded`, this API also outputs CRLF style line breaks. See the note above.
            * The returned text's length does not have to match *count*, even if it will for most PDFs.
              This is because the underlying API may exclude/insert chars compared to the internal list, although rare in practice.
              This means, if the char at ``i`` is excluded, ``get_text_range(i, 2)[1]`` will raise an index error.
              Pdfium provides raw APIs ``FPDFText_GetTextIndexFromCharIndex()`` / ``FPDFText_GetCharIndexFromTextIndex()`` to translate between the two views and identify excluded/inserted chars.
            * In case of leading/trailing excluded characters, pypdfium2 modifies *index* and *count* accordingly to prevent pdfium from unexpectedly reading beyond ``range(index, index+count)``.
        r3   r   r   r   r   zBuffer too small: z vs Nr   r   )count_charsr5   r"   r#   r$   r%   r&   r   FPDFText_GetTextr
   r'   )r   indexcountr   active_ranger:   r;   r8   r9   in_countr/   r0   	out_counts                r   get_text_rangezPdfTextPage.get_text_range`   s(   . B;;$$&&.E 225%+a-HH12 0<,	9Y&&7W$,X\::[)H)HII
-dE5*MM	9$$$&T8&T&T&T&T$$$z*IaK?*+22;v2NNNr   c                 X    t          j        |           }|dk    rt          d          |S )zV
        Returns:
            int: The number of characters on the text page.
        r3   zFailed to get character count.)r   FPDFText_CountCharsr   )r   r.   s     r   r=   zPdfTextPage.count_chars   s1    
 .t44b==>???r   c                 \    t          j        | ||          }|dk    rt          d          |S )a  
        Parameters:
            index (int): Start character index.
            count (int): Character count to consider (defaults to -1 for all remaining).
        Returns:
            int: The number of text rectangles in the given character range.
        r3   zFailed to count rectangles.)r   FPDFText_CountRectsr   )r   r?   r@   n_rectss       r   count_rectszPdfTextPage.count_rects   s5     .tUEBBb==;<<<r   c                     t          j        | ||||          }|dk    rdS |dk    rt          d          |dk    s
J d            |S )a  
        Get the index of a character by position.
        
        Parameters:
            x (float): Horizontal position (in PDF canvas units).
            y (float): Vertical position.
            x_tol (float): Horizontal tolerance.
            y_tol (float): Vertical tolerance.
        Returns:
            int | None: The index of the character at or nearby the point (x, y).
            May be None if there is no character. If an internal error occurred, an exception will be raised.
        r3   Nz6An error occurred on attempt to get char index by pos.r   z8Negative return is not permitted (unhandled error code?))r   FPDFText_GetCharIndexAtPosr   )r   xyx_toly_tolr?   s         r   	get_indexzPdfTextPage.get_index   s[     3D!QuMMB;;4b[[VWWWzzzUzzzr   Fc                    |rIt          j                    }t          j        | ||          }|j        |j        |j        |j        f\  }}}}nst                      t                      t                      t                      f\  }}}}t          j        | |||||          }|j	        |j	        |j	        |j	        f\  }}}}|st          d          ||||fS )a  
        Get the bounding box of a single character.
        
        Parameters:
            index (int):
                Index of the character to work with, in the page's character array.
            loose (bool):
                Get a more comprehensive box covering the entire font bounds, as opposed to the default tight box specific to the one character.
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        zFailed to get charbox.)r   FS_RECTFFPDFText_GetLooseCharBoxr(   r)   r*   r+   c_doubleFPDFText_GetCharBoxvaluer   )	r   r?   looserectoklbrts	            r   get_charboxzPdfTextPage.get_charbox   s      	<$&&D24EEBDKTXEJAq!QQ!XZZXZZGJAq!Q-dE1aAFFB!'17AG;JAq!Q 	86777!Qzr   c                     t                      t                      t                      t                      f\  }}}}t          j        | |||||          }|st          d          |j        |j        |j        |j        fS )aX  
        Get the bounding box of a text rectangle at the given index.

        Attention:
            :meth:`.count_rects` must be called once with default params before subsequent :meth:`.get_rect` calls for this function to work.
        
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        zzFailed to get rectangle. (Make sure count_rects() was called with default params once before subsequent get_rect() calls.))rV   r   FPDFText_GetRectr   rX   )r   r?   r\   r]   r^   r_   r[   s          r   get_rectzPdfTextPage.get_rect   s     ZZXZZC
1a&tUAq!Q?? 	\  [  \  \  \!'1733r   c                 V    t          j        | |          }|sdS t          ||           S )a  
        Returns:
            PdfTextObj | None: A handle to the textobject that includes the char at *index*, or None if it could not be resolved (e.g. escape character).
        Tip:
            Textobjects can also be obtained through :meth:`.PdfPage.get_objects`.
        Ntextpage)r   FPDFText_GetTextObjectr   )r   r?   raw_objs      r   get_textobjzPdfTextPage.get_textobj   s6     1$>> 	4'D1111r   c                    t          |          dk    rt          d          |r|t          j        z  }|r|t          j        z  }|r|t          j        z  }|dz                       d          }t          j        |t          j	        t          j
                            }t          j        | |||          }	t          |	|           }
|                     |
           |
S )a  
        Locate text on the page.
        
        Parameters:
            text (str):
                The string to search for.
            index (int):
                Character index at which to start searching.
            match_case (bool):
                If True, the search will be case-specific (upper and lower letters treated as different characters).
            match_whole_word (bool):
                If True, substring occurrences will be ignored (e. g. `cat` would not match `category`).
            consecutive (bool):
                If False (the default), :meth:`.search` will skip past the current match to look for the next match.
                If True, parts of the previous match may be caught again (e. g. searching for `aa` in `aaaa` would match 3 rather than 2 times).
            flags (int):
                Passthrough of raw pdfium searching flags. Note that you may want to use the boolean options instead.
        Returns:
            PdfTextSearcher: A helper object to search text.
        r   z#Text length must be greater than 0. r   )len
ValueErrorr   FPDF_MATCHCASEFPDF_MATCHWHOLEWORDFPDF_CONSECUTIVEencoder"   r$   r%   r&   FPDFText_FindStartr   _add_kid)r   textr?   
match_casematch_whole_wordconsecutiveflagsenc_textenc_text_ptrraw_searchersearchers              r   searchzPdfTextPage.search   s    , t99>>BCCC 	-X,,E 	2X11E 	/X..E6M))+66{8V^FO-L-LMM24ueTT"<66hr   )NNNNr   )r   r   )r   r3   r   )r   r3   )F)r   FFFr   )__name__
__module____qualname____doc__r   propertyr   r1   r5   rD   r=   rJ   rR   r`   rc   ri   r}   __classcell__r   s   @r   r   r      s        6 6 6 6 6
   X#= #= #= #=L4 4 4 4 +O +O +O +O\       ,   84 4 4"2 2 2% % % % % % % %r   r   c                   J     e Zd ZdZ fdZed             Zd Zd Zd Z	 xZ
S )r   z
    Text searcher helper class.
    
    Attributes:
        raw (FPDF_SCHHANDLE): The underlying PDFium searcher handle.
        textpage (PdfTextPage): Reference to the textpage this searcher belongs to.
    c                 z    || _         || _        t                                          t          j                   d S r	   )r
   rf   r   r   r   FPDFText_FindClose)r   r
   rf   r   s      r   r   zPdfTextSearcher.__init__)  s2     455555r   c                     | j         S r	   re   r   s    r   r   zPdfTextSearcher.parent.  s
    }r   c                 x     ||           }|sd S t          j        |           }t          j        |           }||fS r	   )r   FPDFText_GetSchResultIndexFPDFText_GetSchCount)r   	find_funcr[   r?   r@   s        r   _get_occurrencezPdfTextSearcher._get_occurrence3  sF    Yt__ 	43D99-d33e|r   c                 @    |                      t          j                  S )z
        Returns:
            (int, int) | None: Start character index and count of the next occurrence, or None if the last occurrence was passed.
        )r   r   FPDFText_FindNextr   s    r   get_nextzPdfTextSearcher.get_next;      
 ##H$>???r   c                 @    |                      t          j                  S )z
        Returns:
            (int, int) | None: Start character index and count of the previous occurrence (i. e. the one before the last valid occurrence), or None if the last occurrence was passed.
        )r   r   FPDFText_FindPrevr   s    r   get_prevzPdfTextSearcher.get_prevB  r   r   )r~   r   r   r   r   r   r   r   r   r   r   r   s   @r   r   r      s         6 6 6 6 6
   X  @ @ @@ @ @ @ @ @ @r   r   )__all__r"   loggingpypdfium2.rawr
   r   pypdfium2.internalinternalpdfium_ipypdfium2._helpers.miscr   pypdfium2._helpers.pageobjectsr   rV   	getLoggerr~   loggerAutoCloseabler   r    r   r   <module>r      s    -               % % % % % % / / / / / / 5 5 5 5 5 5?		8	$	$L L L L L8) L L L^'@ '@ '@ '@ '@x- '@ '@ '@ '@ '@r   