i think the process would vary slightly depending on what kind of tools you have access to.
since i have Windows and Microsoft Office, the general process i would have used to handle this, given ~450 records (3 pages of results) from https://jumear.github.io/stirfry/iNatAPIv1_identifications?taxon_id=1024476&rank=subgenus&taxon_active=false¤t=false&disagreement=true&observation_hrank=species&per_page=200, would have been:
- open the above URL in your favorite browser
- copy the table section of the page without the headers
- open up Excel
- click the first cell in the first blank row. paste special (text only).
- go back to your browser and click the button on the page to get the next page of results.
- repeat 2-5 above, as needed.
- select and copy the column that represents the observation id
- open up Word
- in a blank document, paste special (text only).
- find all / replace (ctrl+H) : find all instances of
^p
(return) and replace each with ,
(comma)
- eliminate the last comma in the resulting comma-separated list of ids
if there were a lot more records (many more pages of results), then i probably would have used Power Automate Desktop (included with Windows 11) to automate steps 1-6 above. you can copy the script below and paste it into your own new flow to do the same. (this particular script uses the Edge browser and outputs to Excel, but you could easily change it to use the Chrome or Firefox browser, and then output to a CSV.)
# This flow contains the basic structure to extract data from most /stirfry/iNatAPIv1_xxx.html pages and then open the data in an Excel spreadsheet. In an ideal world, a data extraction flow would just need 2 steps -- one to open the browser, and another to extract the data, handle pagination, and export to Excel. However, the /stirfry pages load a basic skeleton first and then add data based on the response from an API request. This delay between initial load and API response can cause issues issues for the the standard data extraction step, since there's no mechanism to force it to wait for the API request to complete. So this flow's structure allows for such a wait, in part, by handling pagination and data export separately from the data extraction action.
SET urlBase TO $'''https://jumear.github.io/stirfry/iNatAPIv1_identifications.html?taxon_id=1024476&rank=subgenus&taxon_active=false¤t=false&disagreement=true&observation_hrank=species'''
SET pageFirst TO 1
SET pageLast TO 50
SET perPage TO 200
SET delayBeforeExtract TO 1
SET delayMaxRetry TO 20
# dataExtracted is a data table variable that will store the combined results extracted from each page. It is initialized first with column header labels. These labels need to be set to match the fields defined in the main data extraction step.
SET dataExtracted TO { ^['Obs ID', 'Obs URL', 'Obs Grade', 'Obs User Login', 'Obs Observed', 'Obs Submitted', 'Obs Taxon Name', 'Obs Taxon Common Name', 'ID #', 'ID Taxon Name', 'ID Taxon Common Name', 'ID User Login', 'ID Datetime', 'ID Category'] }
Variables.CreateNewList List=> pagesNotExtracted
WebAutomation.LaunchEdge.LaunchEdge Url: $'''%urlBase%&per_page=%perPage%&page=%pageFirst%''' WindowState: WebAutomation.BrowserWindowState.Normal ClearCache: False ClearCookies: False Timeout: 60 BrowserInstance=> Browser
LOOP pageCurr FROM pageFirst TO pageLast STEP 1
LOOP LoopIndex FROM 1 TO delayMaxRetry STEP 1
WAIT delayBeforeExtract
# When the page gets a response from the API, it will add a paragraph <p> to the body of the page which displays either error messages returned from the API or some summary information about the data returned. So if this <p> is found, data extraction can begin. Otherwise, wait again before retrying (up to the maximum number of retries).
IF (WebAutomation.IfWebPageContains.WebPageContainsElement BrowserInstance: Browser Control: appmask['WebPage']['pInfo']) THEN
# This is the main data extraction step. Note that it is not set up to handle pagination, since pagination is handled by the rest of the flow.
WebAutomation.ExtractData.ExtractTable BrowserInstance: Browser Control: $'''html > body > table > tbody > tr''' ExtractionParameters: {[$'''td:eq(1) > a''', $'''Own Text''', $'''%''%''', $'''Value #1'''], [$'''td:eq(1) > a''', $'''Href''', $'''%''%''', $'''Value #2'''], [$'''td:eq(3)''', $'''Own Text''', $'''%''%''', $'''Value #3'''], [$'''td:eq(5)''', $'''Own Text''', $'''%''%''', $'''Value #4'''], [$'''td:eq(6)''', $'''Own Text''', $'''%''%''', $'''Value #5'''], [$'''td:eq(7)''', $'''Own Text''', $'''%''%''', $'''Value #6'''], [$'''td:eq(9)''', $'''Own Text''', $'''%''%''', $'''Value #7'''], [$'''td:eq(10)''', $'''Own Text''', $'''%''%''', $'''Value #8'''], [$'''td:eq(11)''', $'''Own Text''', $'''%''%''', $'''Value #9'''], [$'''td:eq(13)''', $'''Own Text''', $'''%''%''', $'''Value #10'''], [$'''td:eq(14)''', $'''Own Text''', $'''%''%''', $'''Value #11'''], [$'''td:eq(16)''', $'''Own Text''', $'''%''%''', $'''Value #12'''], [$'''td:eq(17)''', $'''Own Text''', $'''%''%''', $'''Value #13'''], [$'''td:eq(18)''', $'''Own Text''', $'''%''%''', $'''Value #14'''] } PostProcessData: True ExtractedData=> DataFromWebPage
IF DataFromWebPage.RowsCount = 0 THEN
Variables.AddItemToList Item: pageCurr List: pagesNotExtracted
ELSE
LOOP FOREACH CurrentItem IN DataFromWebPage
SET dataExtracted TO dataExtracted + CurrentItem
END
END
EXIT LOOP
ELSE IF LoopIndex = delayMaxRetry THEN
Variables.AddItemToList Item: pageCurr List: pagesNotExtracted
END
END
IF (WebAutomation.IfWebPageContains.WebPageDoesNotContainElement BrowserInstance: Browser Control: appmask['WebPage']['nextPage']) THEN
EXIT LOOP
END
WebAutomation.Click.Click BrowserInstance: Browser Control: appmask['WebPage']['nextPage']
END
IF dataExtracted.RowsCount > 0 THEN
Excel.LaunchExcel.LaunchUnderExistingProcess Visible: True Instance=> ExcelInstance
Excel.WriteToExcel.WriteCell Instance: ExcelInstance Value: dataExtracted.ColumnHeadersRow Column: $'''A''' Row: 1
Excel.WriteToExcel.WriteCell Instance: ExcelInstance Value: dataExtracted Column: $'''A''' Row: 2
END
IF pagesNotExtracted.Count > 0 THEN
Text.JoinText.JoinWithCustomDelimiter List: pagesNotExtracted CustomDelimiter: $''', ''' Result=> pagesNotExtracted_CommaSeparated
Display.ShowMessageDialog.ShowMessage Title: $'''Extraction Issues''' Message: $'''No data extracted from these page numbers: %pagesNotExtracted_CommaSeparated%
Check the browser window to see if data exists or error messages were returned from the API.''' Icon: Display.Icon.ErrorIcon Buttons: Display.Buttons.OK DefaultButton: Display.DefaultButton.Button1 IsTopMost: True
END
# [ControlRepository][PowerAutomateDesktop]
{
"ControlRepositorySymbols": [
{
"Name": "appmask",
"ImportMetadata": {
"DisplayName": "Computer",
"ConnectionString": "",
"Type": "Local"
},
"Repository": "{\r\n \"Screens\": [\r\n {\r\n \"Controls\": [\r\n {\r\n \"AutomationProtocol\": \"uia3\",\r\n \"ScreenShot\": null,\r\n \"ElementTypeName\": \"<p>\",\r\n \"Name\": \"pInfo\",\r\n \"SelectorCount\": 1,\r\n \"Selectors\": [\r\n {\r\n \"CustomSelector\": \" > body > p:eq(1)\",\r\n \"Elements\": [\r\n {\r\n \"Attributes\": [\r\n {\r\n \"Ignore\": true,\r\n \"IsOrdinal\": false,\r\n \"Name\": \"Class\",\r\n \"Operation\": \"EqualTo\",\r\n \"Value\": null\r\n },\r\n {\r\n \"Ignore\": true,\r\n \"IsOrdinal\": false,\r\n \"Name\": \"Id\",\r\n \"Operation\": \"EqualTo\",\r\n \"Value\": null\r\n },\r\n {\r\n \"Ignore\": true,\r\n \"IsOrdinal\": true,\r\n \"Name\": \"Ordinal\",\r\n \"Operation\": \"EqualTo\",\r\n \"Value\": \"-1\"\r\n },\r\n {\r\n \"Ignore\": true,\r\n \"IsOrdinal\": false,\r\n \"Name\": \"Title\",\r\n \"Operation\": \"EqualTo\",\r\n \"Value\": null\r\n }\r\n ],\r\n \"CustomValue\": null,\r\n \"Ignore\": false,\r\n \"Name\": \"<body>\",\r\n \"Tag\": \"body\"\r\n },\r\n {\r\n \"Attributes\": [\r\n {\r\n \"Ignore\": true,\r\n \"IsOrdinal\": false,\r\n \"Name\": \"Class\",\r\n \"Operation\": \"EqualTo\",\r\n \"Value\": null\r\n },\r\n {\r\n \"Ignore\": true,\r\n \"IsOrdinal\": false,\r\n \"Name\": \"Id\",\r\n \"Operation\": \"EqualTo\",\r\n \"Value\": null\r\n },\r\n {\r\n \"Ignore\": false,\r\n \"IsOrdinal\": true,\r\n \"Name\": \"Ordinal\",\r\n \"Operation\": \"EqualTo\",\r\n \"Value\": \"1\"\r\n },\r\n {\r\n \"Ignore\": true,\r\n \"IsOrdinal\": false,\r\n \"Name\": \"Title\",\r\n \"Operation\": \"EqualTo\",\r\n \"Value\": null\r\n }\r\n ],\r\n \"CustomValue\": null,\r\n \"Ignore\": false,\r\n \"Name\": \"<p>\",\r\n \"Tag\": \"p\"\r\n }\r\n ],\r\n \"Ignore\": false,\r\n \"IsCustom\": true,\r\n \"IsWindowsInstance\": false,\r\n \"Order\": 0,\r\n \"Name\": \"Selector 1\"\r\n }\r\n ],\r\n \"Tag\": \"p\",\r\n },\r\n {\r\n \"AutomationProtocol\": null,\r\n \"ScreenShot\": null,\r\n \"ElementTypeName\": \"a\",\r\n \"Name\": \"nextPage\",\r\n \"SelectorCount\": 1,\r\n \"Selectors\": [\r\n {\r\n \"CustomSelector\": \"a[Id=\\\"button_next\\\"]\",\r\n \"Elements\": [],\r\n \"Ignore\": false,\r\n \"IsCustom\": true,\r\n \"IsWindowsInstance\": false,\r\n \"Order\": 0,\r\n \"Name\": \"Selector 1\"\r\n }\r\n ],\r\n \"Tag\": \"a\",\r\n \"ScreenshotPath\": \"controlRepo-screenshots\\\\\\\\6cafce73-16e7-445e-b8fa-292a7d877708.png\"\r\n }\r\n ],\r\n \"ScreenShot\": null,\r\n \"ElementTypeName\": \"Web Page\",\r\n \"Name\": \"WebPage\",\r\n \"SelectorCount\": 1,\r\n \"Selectors\": [\r\n {\r\n \"CustomSelector\": \":desktop > domcontainer\",\r\n \"Elements\": [\r\n {\r\n \"Attributes\": [],\r\n \"CustomValue\": \"domcontainer\",\r\n \"Ignore\": false,\r\n \"Name\": \"Web Page\",\r\n \"Tag\": \"domcontainer\"\r\n }\r\n ],\r\n \"Ignore\": false,\r\n \"IsCustom\": false,\r\n \"IsWindowsInstance\": false,\r\n \"Order\": 0,\r\n \"Name\": \"Selector 1\"\r\n }\r\n ],\r\n \"Tag\": \"domcontainer\",\r\n }\r\n ],\r\n \"Version\": 1\r\n}"
}
],
"ImageRepositorySymbol": {
"Name": "imgrepo",
"ImportMetadata": {},
"Repository": "{\r\n \"Folders\": [],\r\n \"Images\": [],\r\n \"Version\": 1\r\n}"
},
"ConnectionReferences": []
}