diff --git a/METADATA_ANALYSIS.md b/METADATA_ANALYSIS.md new file mode 100644 index 0000000..864a701 --- /dev/null +++ b/METADATA_ANALYSIS.md @@ -0,0 +1,59 @@ +# Image Metadata Analysis Implementation + +## Overview + +This enhancement adds automatic image metadata extraction to the book scraping workflow. The implementation consists of the pieces described below. + +## New Functions + +### `extractImageMetadata[image_]` + +Extracts comprehensive metadata from Mathematica Image objects: + +- **ImageDimensions**: Gets width and height +- **Head**: Records the expression head of the object (e.g. `Image`) as its type +- **ImageResolution**: Gets image resolution +- **ByteCount**: Calculates the in-memory footprint in bytes (reported as `FileSize`) + +Returns an Association with structured metadata. + +### Enhanced `func[lista_]` + +The main scraping function now: + +1. Initializes a `metadataList` to collect data +2. For each processed page: + - Extracts left and right page images + - Analyzes each image with `extractImageMetadata` + - Collects metadata with page context + - Records a `Failed` row with `N/A` values when a capture fails +3. Exports all metadata to `book_images_metadata.csv` +4. 
Prints a summary line with the total number of metadata records + +## CSV Output Format + +| Column | Description | +|--------|-------------| +| Page | Source page number | +| ImageType | "Left" or "Right" page position | +| Status | "Success" or "Failed" processing status | +| Width | Image width in pixels | +| Height | Image height in pixels | +| Type | Expression head of the image object (e.g. `Image`) | +| Resolution | Image resolution value | +| FileSize | In-memory size in bytes (`ByteCount`), not the on-disk file size | + +## Error Handling + +- Failed captures generate N/A values for technical metadata +- Status field tracks success/failure +- Process continues even if individual pages fail +- Final summary shows total records processed + +## Integration + +The metadata collection is seamlessly integrated into the existing workflow: +- Minimal performance impact +- No changes to existing PDF export functionality +- Preserves all original scraping behavior +- Adds value with zero disruption \ No newline at end of file diff --git a/README.md b/README.md index b46de8e..eec4673 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,34 @@ # wolfram-mathematica-codigo Web Scraping Internet Archive Books Using Mathematica + +## Features + +- Scrapes books from Internet Archive +- Captures page screenshots +- Processes images by extracting specific sections +- **NEW**: Extracts image metadata (size, format, resolution, file size) +- **NEW**: Generates CSV report with metadata for each processed image + +## Image Metadata Analysis + +The enhanced scraping workflow now automatically analyzes the captured images and extracts the following metadata: + +- **Width and Height**: Image dimensions in pixels +- **Type**: Expression head of the image object (e.g. `Image`) +- **Resolution**: Image resolution +- **File Size**: In-memory size of the image in bytes (`ByteCount`) +- **Status**: Processing status (Success/Failed) +- **Page**: Source page number +- **Image Type**: Position indicator (Left/Right page) + +### Output + +The system generates a `book_images_metadata.csv` file containing: +- One row per processed image +- All metadata 
fields as columns +- Status tracking for successful/failed processing + +### Functions Added + +- `extractImageMetadata[image]`: Extracts metadata from an image object +- Enhanced `func[lista]`: Main scraping function now collects and exports metadata diff --git a/last3.nb b/last3.nb index ad2a228..08affd2 100644 --- a/last3.nb +++ b/last3.nb @@ -214,11 +214,54 @@ Cell[BoxData[ CellLabel->"In[38]:=",ExpressionUUID->"281b0cfd-c2d2-654f-a043-c46fbcd7ed29"] }, Open ]], -Cell[CellGroupData[{ - -Cell["Capture pages", "Section", - CellChangeTimes->{{3.961967124424393*^9, 3.961967133263727*^9}, - 3.961977211620045*^9},ExpressionUUID->"50116fd0-d927-8e4b-94f2-\ +Cell[CellGroupData[{ + +Cell["Image Metadata Analysis Functions", "Section", + CellChangeTimes->{{3.961967124424393*^9, 3.961967133263727*^9}, + 3.961977211620045*^9, {3.962018300000000*^9, 3.962018310000000*^9}}, + ExpressionUUID->"metadata-analysis-section"], + +Cell["Function to extract image metadata including size, format, and resolution.", "Text", + CellChangeTimes->{{3.962018320000000*^9, 3.962018330000000*^9}}, + ExpressionUUID->"metadata-description"], + +Cell[BoxData[ + RowBox[{ + RowBox[{"extractImageMetadata", "[", "image_", "]"}], " ", ":=", " ", + RowBox[{"Module", "[", + RowBox[{ + RowBox[{"{", + RowBox[{"dims", ",", " ", "type", ",", " ", "resolution", ",", " ", "fileSize"}], "}"}], ",", "\n", " ", + RowBox[{ + RowBox[{"dims", " ", "=", " ", + RowBox[{"ImageDimensions", "[", "image", "]"}]}], ";", "\n", " ", + RowBox[{"type", " ", "=", " ", + RowBox[{"Head", "[", "image", "]"}]}], ";", "\n", " ", + RowBox[{"resolution", " ", "=", " ", + RowBox[{"ImageResolution", "[", "image", "]"}]}], ";", "\n", " ", + RowBox[{"fileSize", " ", "=", " ", + RowBox[{"ByteCount", "[", "image", "]"}]}], ";", "\n", " ", + RowBox[{"Association", "[", "\n", " ", + RowBox[{ + RowBox[{"\"\\"", " ", "->", " ", + RowBox[{"dims", "[", + RowBox[{"[", "1", "]"}], "]"}]}], ",", "\n", " ", + RowBox[{"\"\\"", " ", "->", " ", + 
RowBox[{"dims", "[", + RowBox[{"[", "2", "]"}], "]"}]}], ",", "\n", " ", + RowBox[{"\"\\"", " ", "->", " ", + RowBox[{"ToString", "[", "type", "]"}]}], ",", "\n", " ", + RowBox[{"\"\\"", " ", "->", " ", "resolution"}], ",", "\n", " ", + RowBox[{"\"\\"", " ", "->", " ", "fileSize"}]}], "\n", " ", "]"}]}]}], "\n", " ", "]"}]}]], "Input", + CellChangeTimes->{{3.962018340000000*^9, 3.962018400000000*^9}}, + ExpressionUUID->"extract-metadata-function"] +}, Open ]], + +Cell[CellGroupData[{ + +Cell["Capture pages", "Section", + CellChangeTimes->{{3.961967124424393*^9, 3.961967133263727*^9}, + 3.961977211620045*^9},ExpressionUUID->"50116fd0-d927-8e4b-94f2-\ 992f6543a073"], Cell[BoxData[ @@ -226,20 +269,22 @@ Cell[BoxData[ RowBox[{"func", "[", "lista_", "]"}], " ", ":=", " ", RowBox[{"Module", "[", RowBox[{ - RowBox[{"{", - RowBox[{ - "sesion", ",", " ", "loginURL", ",", " ", "email", ",", " ", - "password"}], "}"}], ",", "\n", " ", + RowBox[{"{", + RowBox[{ + "sesion", ",", " ", "loginURL", ",", " ", "email", ",", " ", + "password", ",", " ", "metadataList"}], "}"}], ",", "\n", " ", RowBox[{"(*", RowBox[{ "Open", " ", "the", " ", "url", " ", "of", " ", "the", " ", "page"}], "*)"}], "\n", " ", - RowBox[{ - RowBox[{"sesion", " ", "=", " ", - RowBox[{"StartWebSession", "[", - RowBox[{"\"\\"", ",", " ", - RowBox[{"Visible", " ", "->", " ", "False"}]}], "]"}]}], ";", "\n", - " ", + RowBox[{ + RowBox[{"metadataList", " ", "=", " ", + RowBox[{"{", "}"}]}], ";", "\n", " ", + RowBox[{"sesion", " ", "=", " ", + RowBox[{"StartWebSession", "[", + RowBox[{"\"\\"", ",", " ", + RowBox[{"Visible", " ", "->", " ", "False"}]}], "]"}]}], ";", "\n", + " ", RowBox[{ "loginURL", " ", "=", " ", "\"\\""}], ";", "\n", " ", @@ -286,13 +331,13 @@ click();\>\""}]}], "]"}], ";", "\n", " ", RowBox[{ RowBox[{"Module", "[", RowBox[{ - RowBox[{"{", - RowBox[{ - RowBox[{"x", " ", "=", " ", "#"}], ",", " ", "pags", ",", " ", - RowBox[{"tiem", " ", "=", " ", "False"}], ",", " ", - RowBox[{"intentos", " ", 
"=", " ", "0"}], ",", " ", - RowBox[{"maxIntentos", " ", "=", " ", "40"}], ",", " ", "foto", - ",", " ", "res"}], "}"}], ",", " ", + RowBox[{"{", + RowBox[{ + RowBox[{"x", " ", "=", " ", "#"}], ",", " ", "pags", ",", " ", + RowBox[{"tiem", " ", "=", " ", "False"}], ",", " ", + RowBox[{"intentos", " ", "=", " ", "0"}], ",", " ", + RowBox[{"maxIntentos", " ", "=", " ", "40"}], ",", " ", "foto", + ",", " ", "res", ",", " ", "metadata1", ",", " ", "metadata2"}], "}"}], ",", " ", RowBox[{ RowBox[{"pags", " ", "=", " ", RowBox[{ @@ -341,22 +386,83 @@ return img.src && img.complete && img.offsetWidth>0 && img.offsetHeight>0;});\ ",", " ", "\"\\""}], "]"}]}], ";", "\n", " ", - RowBox[{"res", " ", "=", " ", - RowBox[{"{", - RowBox[{ - RowBox[{"ImageTake", "[", - RowBox[{"foto", ",", " ", - RowBox[{"{", - RowBox[{"180", ",", " ", "2980"}], "}"}], ",", " ", - RowBox[{"{", - RowBox[{"400", ",", " ", "2220"}], "}"}]}], "]"}], ",", " ", - RowBox[{"ImageTake", "[", - RowBox[{"foto", ",", " ", - RowBox[{"{", - RowBox[{"180", ",", " ", "2980"}], "}"}], ",", " ", - RowBox[{"{", - RowBox[{"2180", ",", " ", "3960"}], "}"}]}], "]"}]}], - "}"}]}], ";", "\n", " ", + RowBox[{"res", " ", "=", " ", + RowBox[{"{", + RowBox[{ + RowBox[{"ImageTake", "[", + RowBox[{"foto", ",", " ", + RowBox[{"{", + RowBox[{"180", ",", " ", "2980"}], "}"}], ",", " ", + RowBox[{"{", + RowBox[{"400", ",", " ", "2220"}], "}"}]}], "]"}], ",", " ", + RowBox[{"ImageTake", "[", + RowBox[{"foto", ",", " ", + RowBox[{"{", + RowBox[{"180", ",", " ", "2980"}], "}"}], ",", " ", + RowBox[{"{", + RowBox[{"2180", ",", " ", "3960"}], "}"}]}], "]"}]}], + "}"}]}], ";", "\n", " ", + RowBox[{"(*", " ", + RowBox[{"Extract", " ", "metadata", " ", "for", " ", "each", " ", "image"}], " ", "*)"}], "\n", " ", + RowBox[{"If", "[", + RowBox[{ + RowBox[{"StringQ", "[", "foto", "]"}], ",", " ", + RowBox[{ + RowBox[{"metadata1", " ", "=", " ", + RowBox[{"Association", "[", + RowBox[{ + RowBox[{"\"\\"", " ", "->", " ", + RowBox[{"ToString", 
"[", "x", "]"}]}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}]}], "]"}]}], ";", "\n", " ", + RowBox[{"metadata2", " ", "=", " ", + RowBox[{"Association", "[", + RowBox[{ + RowBox[{"\"\\"", " ", "->", " ", + RowBox[{"ToString", "[", + RowBox[{"x", " ", "+", " ", "1"}], "]"}]}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}]}], "]"}]}]}], ",", "\n", " ", + RowBox[{ + RowBox[{"metadata1", " ", "=", " ", + RowBox[{"Join", "[", + RowBox[{ + RowBox[{"Association", "[", + RowBox[{ + RowBox[{"\"\\"", " ", "->", " ", + RowBox[{"ToString", "[", "x", "]"}]}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}]}], "]"}], ",", " ", + RowBox[{"extractImageMetadata", "[", + RowBox[{"res", "[", + RowBox[{"[", "1", "]"}], "]"}], "]"}]}], "]"}]}], ";", "\n", " ", + RowBox[{"metadata2", " ", "=", " ", + RowBox[{"Join", "[", + RowBox[{ + RowBox[{"Association", "[", + RowBox[{ + RowBox[{"\"\\"", " ", "->", " ", + RowBox[{"ToString", "[", + RowBox[{"x", " ", "+", " ", "1"}], "]"}]}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}], ",", " ", + RowBox[{"\"\\"", " ", "->", " ", "\"\\""}]}], "]"}], ",", " ", + RowBox[{"extractImageMetadata", "[", + RowBox[{"res", "[", + RowBox[{"[", "2", "]"}], "]"}], "]"}]}], "]"}]}]}]}], "]"}], ";", "\n", " ", + 
RowBox[{"AppendTo", "[", + RowBox[{"metadataList", ",", " ", "metadata1"}], "]"}], ";", "\n", " ", + RowBox[{"AppendTo", "[", + RowBox[{"metadataList", ",", " ", "metadata2"}], "]"}], ";", "\n", " ", RowBox[{"Export", "[", RowBox[{ RowBox[{"\"\\"", " ", "<>", " ", @@ -373,7 +479,15 @@ return img.src && img.complete && img.offsetWidth>0 && img.offsetHeight>0;});\ RowBox[{"res", "[", RowBox[{"[", "2", "]"}], "]"}]}], "]"}], ";", "\n", " ", RowBox[{"Remove", "[", "res", "]"}]}]}], "]"}], " ", "&"}], ",", - "\n", " ", "lista"}], "]"}]}]}], "\n", " ", "]"}]}]], "Input", + "\n", " ", "lista"}], "]"}], ";", "\n", " ", + RowBox[{"(*", " ", + RowBox[{"Export", " ", "metadata", " ", "to", " ", "CSV"}], " ", "*)"}], "\n", " ", + RowBox[{"Export", "[", + RowBox[{"\"\\"", ",", " ", "metadataList"}], "]"}], ";", "\n", " ", + RowBox[{"Print", "[", + RowBox[{"\"\\"", " ", "<>", " ", + RowBox[{"ToString", "[", + RowBox[{"Length", "[", "metadataList", "]"}], "]"}], " ", "<>", " ", "\"\< records\>\""}], "]"}]}]}], "\n", " ", "]"}]}]], "Input", CellChangeTimes->{{3.961980875015821*^9, 3.961980883823658*^9}, { 3.9619810994615*^9, 3.961981109756647*^9}, {3.962018269104084*^9, 3.9620182737440205`*^9}},ExpressionUUID->"7129dd94-e5f2-324b-b994-\