jaa new req update #9

Open
Elanchezhian2712 opened this issue Feb 22, 2025 · 0 comments
Elanchezhian2712 commented Feb 22, 2025
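
For reference, a minimal sketch of the imports this snippet appears to rely on. The standard-library and third-party imports are straightforward; the project-local names (models, get_db, process_content, image_format, is_allowed_file, is_valid_url) are assumptions about where those helpers live and may differ in the actual codebase.

import asyncio
import json
from datetime import date, timedelta
from io import BytesIO

import pandas as pd
import PIL.Image
import requests
from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from sqlalchemy.orm import Session

# Assumed project-local modules; the real module names/paths may differ.
import models
from database import get_db
from utils import image_format, is_allowed_file, is_valid_url, process_content

app = FastAPI()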

@app.post("/extract-invoice")
async def extract_invoice(file: UploadFile = File(...), username: str = Form(...), db: Session = Depends(get_db)):
    filename = file.filename

    if not is_allowed_file(filename):
        db.close()
        raise HTTPException(
            status_code=400,
            detail="Invalid file type. Allowed formats are: .jpg, .jpeg, .png, .pdf, .jfif, .csv, .xlsx"
        )

    try:
        file_bytes = await file.read()

        if filename.lower().endswith(('.csv', '.xlsx')):
            if filename.lower().endswith('.csv'):
                df = pd.read_csv(BytesIO(file_bytes))
            elif filename.lower().endswith('.xlsx'):
                df = pd.read_excel(BytesIO(file_bytes))

            if "Invoice link" not in df.columns:
                db.close()
                raise HTTPException(status_code=400, detail="Invoice link column not found")

            df['Invoice link'] = df['Invoice link'].fillna('-')
            extracted_data = []
            invalid_count = 0
            duplicate_count = 0
            success_count = 0
            request_count = 0
            failed_count = 0
            processed_links = set()

            for idx, link in enumerate(df['Invoice link'], 1):
                if not is_valid_url(link):
                    invalid_count += 1
                    print(f"Skipping invalid link: {link}")
                    extracted_data.append({})
                    invoice_data_db = models.InvoiceData(
                        url_link=link,
                        file_name=filename,
                        created_by=username,
                        invalid_success="Failed",
                    )
                    db.add(invoice_data_db)
                    try:
                        db.commit()
                        db.refresh(invoice_data_db)
                    except Exception as e:
                        db.rollback()
                        print(f"Database error saving invalid link data: {e}")
                    continue

                if request_count >= 10:
                    print(f"Processed {request_count} requests in the last minute. Waiting for the next minute...")
                    await asyncio.sleep(60)
                    request_count = 0

                data = await process_invoice_link(link, db, filename=filename, url_link=link) # Process content first

                if link in processed_links: # Check link duplicate AFTER processing
                    duplicate_count += 1
                    print(f"Duplicate link found in this file: {link}")
                    extracted_data.append({})

                    successful_record = db.query(models.InvoiceData).filter(
                        models.InvoiceData.url_link == link,
                        models.InvoiceData.status_success == "Success"
                    ).first()

                    if successful_record:
                        invoice_data_db = models.InvoiceData(
                            invoice_no=successful_record.invoice_no,
                            supplier_gstin_number=successful_record.supplier_gstin_number,
                            invoice_amount=successful_record.invoice_amount,
                            invoice_date=successful_record.invoice_date,
                            url_link=link,
                            file_name=filename,
                            created_by=username,
                            duplicated_success="Duplicate Link", # Differentiate Duplicate Link vs Content
                            output=successful_record.output,
                        )
                    else:
                        invoice_data_db = models.InvoiceData(
                            url_link=link,
                            file_name=filename,
                            created_by=username,
                            duplicated_success="Duplicate Link", # Differentiate Duplicate Link vs Content
                        )

                    db.add(invoice_data_db)
                    db.commit()
                    db.refresh(invoice_data_db)
                    continue

                if data.get("duplicate_content"):  # Check for content duplicate AFTER processing
                    duplicate_count += 1
                    print(f"Content duplicate found for link: {link}")
                    extracted_data.append({})

                    invoice_no = data.get("Invoice No")
                    supplier_gstin_number = data.get("Supplier GSTIN")
                    invoice_amount_str = data.get("Total Amount", "0") # Default to "0" if missing
                    invoice_date = data.get("Invoice Date")

                    print(f"Content Duplicate Check - extract_invoice - Extracted Data: Invoice No: {invoice_no}, GSTIN: {supplier_gstin_number}, Amount: {invoice_amount_str}, Date: {invoice_date}")

                    existing_record = db.query(models.InvoiceData).filter(
                        models.InvoiceData.invoice_no == invoice_no,
                        models.InvoiceData.supplier_gstin_number == supplier_gstin_number,
                        models.InvoiceData.invoice_date == invoice_date,
                        models.InvoiceData.invoice_amount == invoice_amount_str # String-to-string comparison - no float() conversion
                    ).first()

                    print("existing_record", existing_record)

                    if existing_record: # Should always exist if "duplicate_content" is True
                        print(f"Content Duplicate Check - extract_invoice - Existing Record Found: Invoice No: {existing_record.invoice_no}, GSTIN: {existing_record.supplier_gstin_number}, Amount: {existing_record.invoice_amount}, Date: {existing_record.invoice_date}")

                        invoice_data_db = models.InvoiceData(
                            invoice_no=existing_record.invoice_no,
                            supplier_gstin_number=existing_record.supplier_gstin_number,
                            invoice_amount=existing_record.invoice_amount,
                            invoice_date=existing_record.invoice_date,
                            url_link=link, # Still store the current link
                            file_name=filename,
                            created_by=username,
                            duplicated_success="Content Duplicate", # Mark as content duplicate
                            output=existing_record.output,
                        )
                        db.add(invoice_data_db)
                        db.commit()
                        db.refresh(invoice_data_db)
                    else:
                        print(f"Warning: Content duplicate flagged but no existing record found for link: {link}")
                        print("Content Duplicate Check - extract_invoice - NO Existing Record Found Matching Extracted Data.")

                else: # Normal processing if not content or link duplicate
                    extracted_data.append(data)
                    processed_links.add(link)
                    request_count += 1
                    if data:
                        success_count += 1
                        print(f"Extracted {idx} invoice(s) from the provided link.")

                        invoice_data_db = models.InvoiceData( # Create record here for success
                            invoice_no=data.get("Invoice No"),
                            supplier_gstin_number=data.get("Supplier GSTIN"),
                            invoice_amount=data.get("Total Amount", "0"), # Store as string to match DB type? Or convert DB column to numeric
                            invoice_date=data.get("Invoice Date"),
                            url_link=link,
                            file_name=filename,
                            created_by=username,
                            output=json.dumps(data), # Store JSON as string in DB
                            status_success="Success"
                        )
                        db.add(invoice_data_db)
                        db.commit()
                        db.refresh(invoice_data_db)

                    else:
                        failed_count += 1
                        print(f"Extraction failed for link {link}")
                        invoice_data_db = models.InvoiceData( # Create record here for failure
                            url_link=link,
                            file_name=filename,
                            created_by=username,
                            status_success="Failed"
                        )
                        db.add(invoice_data_db)
                        db.commit()
                        db.refresh(invoice_data_db)


            total_count = success_count + duplicate_count + invalid_count + failed_count
            db.close()
            return JSONResponse(content={
                "extracted_data": extracted_data,
                "skipped_count": invalid_count,
                "duplicate_count": duplicate_count,
                "failed_count": failed_count,
                "success_count": success_count,
                "total_count": total_count,
                "file name": filename,
            })

        elif filename.lower().endswith('.pdf'):
            pdf_part = {
                "mime_type": "application/pdf",
                "data": file_bytes
            }
            extracted_data = await process_content(pdf_part, db, filename=filename)
            db.close()
            return JSONResponse(content={"extracted_data": extracted_data})

        else:
            pil_image = PIL.Image.open(BytesIO(file_bytes))
            pil_image = pil_image.convert("L")
            image_bytes = BytesIO()
            pil_image.save(image_bytes, format="PNG")
            image_part = image_format(image_bytes.getvalue())
            extracted_data = await process_content(image_part, db, filename=filename)
            db.close()
            return JSONResponse(content={"extracted_data": extracted_data})

    except HTTPException as http_exc:
        db.close()
        raise http_exc
    except Exception as e:
        db.close()
        return JSONResponse(status_code=500, content={"error": str(e)})
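
The helpers called above (is_allowed_file, is_valid_url, image_format) are not included in this snippet. Below is a minimal sketch of what they might look like, inferred only from how they are used; the names and behavior are assumptions, not the project's actual implementations.

ALLOWED_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.pdf', '.jfif', '.csv', '.xlsx')

def is_allowed_file(filename: str) -> bool:
    # Hypothetical: accept only the extensions listed in the 400 error message above.
    return filename.lower().endswith(ALLOWED_EXTENSIONS)

def is_valid_url(link) -> bool:
    # Hypothetical: treat a cell as a usable link only if it parses as an http(s) URL.
    from urllib.parse import urlparse
    if not isinstance(link, str):
        return False
    parsed = urlparse(link.strip())
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)

def image_format(image_bytes: bytes) -> dict:
    # Hypothetical: wrap PNG bytes in the same mime_type/data shape used for PDFs above.
    return {"mime_type": "image/png", "data": image_bytes}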

async def process_invoice_link(link: str, db: Session, filename: str, url_link: str):
    retries = 10
    retry_delay = 5

    for attempt in range(retries):
        try:
            response = requests.get(link)
            response.raise_for_status()

            file_bytes = response.content
            file_ext = link.split('.')[-1].lower()

            if file_ext in ('jpg', 'jpeg', 'png', 'jfif'):
                pil_image = PIL.Image.open(BytesIO(file_bytes))
                pil_image = pil_image.convert("L")
                image_bytes = BytesIO()
                pil_image.save(image_bytes, format="PNG")
                image_part = image_format(image_bytes.getvalue())
                extracted_data = await process_content(image_part, db, filename=filename, url_link=url_link)

            elif file_ext == 'pdf':
                pdf_part = {
                    "mime_type": "application/pdf",
                    "data": file_bytes
                }
                extracted_data = await process_content(pdf_part, db, filename=filename, url_link=url_link)
            else:
                print(f"Unsupported file type in link: {link}")
                return {}

            # --- Content Duplicate Check against last 5 Days of Records ---
            invoice_no = extracted_data.get("Invoice No")
            supplier_gstin_number = extracted_data.get("Supplier GSTIN")
            invoice_amount_str = str(extracted_data.get("Total Amount", ""))
            invoice_date = extracted_data.get("Invoice Date")

            print(f"Extracted Data from process_invoice_link - process_content: Invoice No: {invoice_no}, GSTIN: {supplier_gstin_number}, Amount: {invoice_amount_str}, Date: {invoice_date}")

            if invoice_no and supplier_gstin_number and invoice_amount_str and invoice_date:
                five_days_ago = date.today() - timedelta(days=4) # Start of the 5-day window: today plus the previous 4 days

                # Fetch invoices from the last 5 days
                recent_invoices = (
                    db.query(models.InvoiceData)
                    .filter(models.InvoiceData.created_at >= five_days_ago) # Filter by date
                    .order_by(models.InvoiceData.id.desc())
                    .all()
                )

                print(f"Checking invoices from last 5 days. Date threshold: {five_days_ago}")
                print(f"Number of recent invoices to check: {len(recent_invoices)}")


                if not recent_invoices: # Explicitly check if recent_invoices is empty
                    print("No previous invoices found in last 5 days. Skipping duplicate content check against recent invoices for now.")
                else:
                    for existing_invoice in recent_invoices:
                        if existing_invoice.url_link == link: # Link duplicate check - done AFTER content processing
                            print(f"Link duplicate found in last 5 days for link: {link}")
                            return {
                                "duplicate_content": True,
                                "Invoice No": invoice_no,
                                "Supplier GSTIN": supplier_gstin_number,
                                "Total Amount": invoice_amount_str,
                                "Invoice Date": invoice_date,
                                "duplicate_link": True
                            }
                    for existing_invoice in recent_invoices: # Content duplicate check if no link duplicate
                        existing_invoice_amount_str = str(existing_invoice.invoice_amount) if existing_invoice.invoice_amount is not None else "0" # Handle potential None values from DB

                        print(f"process_invoice_link - Comparing against DB record: Invoice No: {existing_invoice.invoice_no}, GSTIN: {existing_invoice.supplier_gstin_number}, Amount: {existing_invoice_amount_str}, Date: {existing_invoice.invoice_date}")
                        # Content Duplicate Check: Invoice No, Supplier GSTIN, Invoice Date, and Invoice Amount MUST match
                        if (existing_invoice.invoice_no == invoice_no and
                                existing_invoice.supplier_gstin_number == supplier_gstin_number and
                                existing_invoice.invoice_date == invoice_date and
                                existing_invoice_amount_str == invoice_amount_str): # String-to-string comparison

                            print(f"Content duplicate found for invoice: {invoice_no}, GSTIN: {supplier_gstin_number}, Date: {invoice_date}, Amount: {invoice_amount_str}")
                            return {
                                "duplicate_content": True,
                                "Invoice No": invoice_no,
                                "Supplier GSTIN": supplier_gstin_number,
                                "Total Amount": invoice_amount_str,
                                "Invoice Date": invoice_date,
                                "duplicate_link": False
                            }

            return extracted_data

        except requests.exceptions.RequestException as e:
            print(f"Request error for link {link} (Attempt {attempt+1}/{retries}): {e}")
            if attempt < retries - 1:
                await asyncio.sleep(retry_delay) # Non-blocking wait; time.sleep() would block the event loop
            else:
                print(f"Request failed after multiple retries for link {link}. Skipping.")
                return {}
        except Exception as e:
            print(f"Error processing link {link}: {e}")
            return {}

    return {}
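
For reference, the models.InvoiceData columns this code reads and writes, reconstructed from the attribute names used above. This is an assumption about the model, not its actual definition; column types, the table name, and defaults may differ.

from sqlalchemy import Column, DateTime, Integer, String, func
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class InvoiceData(Base):
    # Hypothetical reconstruction based only on the attributes referenced in this snippet.
    __tablename__ = "invoice_data"

    id = Column(Integer, primary_key=True)
    invoice_no = Column(String)
    supplier_gstin_number = Column(String)
    invoice_amount = Column(String)       # compared as a string in the duplicate checks above
    invoice_date = Column(String)
    url_link = Column(String)
    file_name = Column(String)
    created_by = Column(String)
    output = Column(String)               # extracted JSON stored as text
    status_success = Column(String)       # "Success" / "Failed"
    invalid_success = Column(String)      # "Failed" for invalid links
    duplicated_success = Column(String)   # "Duplicate Link" / "Content Duplicate"
    created_at = Column(DateTime, server_default=func.now())  # drives the 5-day duplicate window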