i just wanted to come back for a moment to share a Power Automate Desktop flow that can be used to get this random set of observations that i noted before. as defined, the flow uses Edge as the browser, but it can be adapted for Firefox or Chrome. it can also be adapted to use different filter parameters, request different numbers of records/pages, or get data from other jumear…/stirfry/iNatv1API_xxx.html pages. the flow dumps the data into Excel, but the flow can be adapted to export to CSV or some other format.
to use the flow, simply copy the code below and paste it into a new flow in Power Automate Desktop.
# This flow contains the basic structure to extract data from most /stirfry/iNatAPIv1_xxx.html pages and then open the data in an Excel spreadsheet. In an ideal world, a data extraction flow would just need 2 steps -- one to open the browser, and another to extract the data, handle pagination, and export to Excel. However, the /stirfry pages load a basic skeleton first and then add data based on the response from an API request. This delay between initial load and API response can cause issues issues for the the standard data extraction step, since there's no mechanism to force it to wait for the API request to complete. So this flow's structure allows for such a wait, in part, by handling pagination and data export separately from the data extraction action.
SET urlBase TO $'''https://jumear.github.io/stirfry/iNatAPIv1_observations.html?quality_grade=needs_id&options=idextra&order_by=random'''
SET pageFirst TO 1
SET pageLast TO 10
SET perPage TO 200
SET delayBeforeExtract TO 1
SET delayMaxRetry TO 20
# dataExtracted is a data table variable that will store the combined results extracted from each page. It is initialized first with column header labels. These labels need to be set to match the fields defined in the main data extraction step.
SET dataExtracted TO { ^['Obs ID', 'Obs URL', 'Taxon', 'Common', 'Rank', 'Grade', 'ID Count', 'ID Count @ Obs', 'ID Taxa @ Obs', 'ID Taxa @ Ansc', 'ID Taxa @ Desc', 'ID Taxa @ Other', 'Obs Taxon = Community Taxon', 'Obs Date', 'Sub Date'] }
Variables.CreateNewList List=> pagesNotExtracted
WebAutomation.LaunchEdge.LaunchEdge Url: $'''%urlBase%&per_page=%perPage%&page=%pageFirst%''' WindowState: WebAutomation.BrowserWindowState.Normal ClearCache: False ClearCookies: False Timeout: 60 BrowserInstance=> Browser
LOOP pageCurr FROM pageFirst TO pageLast STEP 1
LOOP LoopIndex FROM 1 TO delayMaxRetry STEP 1
WAIT delayBeforeExtract
# When the page gets a response from the API, it will add a paragraph <p> to the body of the page which displays either error messages returned from the API or some summary information about the data returned. So if this <p> is found, data extraction can begin. Otherwise, wait again before retrying (up to the maximum number of retries).
IF (WebAutomation.IfWebPageContains.WebPageContainsElement BrowserInstance: Browser Control: appmask['WebPage']['pInfo']) THEN
# This is the main data extraction step. Note that it is not set up to handle pagination, since pagination is handled by the rest of the flow.
WebAutomation.ExtractData.ExtractTable BrowserInstance: Browser Control: $'''html > body > table > tbody > tr''' ExtractionParameters: {[$'''td:eq(1) > a''', $'''Own Text''', $'''%''%''', $'''Value #1'''], [$'''td:eq(1) > a''', $'''Href''', $'''%''%''', $'''Value #2'''], [$'''td:eq(3) > a''', $'''Own Text''', $'''%''%''', $'''Value #3'''], [$'''td:eq(4)''', $'''Own Text''', $'''%''%''', $'''Value #4'''], [$'''td:eq(5)''', $'''Own Text''', $'''%''%''', $'''Value #5'''], [$'''td:eq(6)''', $'''Own Text''', $'''%''%''', $'''Value #6'''], [$'''td:eq(7)''', $'''Own Text''', $'''%''%''', $'''Value #7'''], [$'''td:eq(8)''', $'''Own Text''', $'''%''%''', $'''Value #8'''], [$'''td:eq(9)''', $'''Own Text''', $'''%''%''', $'''Value #9'''], [$'''td:eq(10)''', $'''Own Text''', $'''%''%''', $'''Value #10'''], [$'''td:eq(11)''', $'''Own Text''', $'''%''%''', $'''Value #11'''], [$'''td:eq(12)''', $'''Own Text''', $'''%''%''', $'''Value #12'''], [$'''td:eq(13)''', $'''Own Text''', $'''%''%''', $'''Value #13'''], [$'''td:eq(16)''', $'''Own Text''', $'''%''%''', $'''Value #14'''], [$'''td:eq(17)''', $'''Own Text''', $'''%''%''', $'''Value #15'''] } ExtractedData=> DataFromWebPage
IF DataFromWebPage.RowsCount = 0 THEN
Variables.AddItemToList Item: pageCurr List: pagesNotExtracted NewList=> pagesNotExtracted
ELSE
LOOP FOREACH CurrentItem IN DataFromWebPage
SET dataExtracted TO dataExtracted + CurrentItem
END
END
EXIT LOOP
ELSE IF LoopIndex = delayMaxRetry THEN
Variables.AddItemToList Item: pageCurr List: pagesNotExtracted NewList=> pagesNotExtracted
END
END
IF (WebAutomation.IfWebPageContains.WebPageDoesNotContainElement BrowserInstance: Browser Control: appmask['WebPage']['nextPage']) THEN
EXIT LOOP
END
WebAutomation.Click.Click BrowserInstance: Browser Control: appmask['WebPage']['nextPage']
END
IF dataExtracted.RowsCount > 0 THEN
Excel.LaunchExcel.LaunchUnderExistingProcess Visible: True Instance=> ExcelInstance
Excel.WriteToExcel.WriteCell Instance: ExcelInstance Value: dataExtracted.ColumnHeadersRow Column: $'''A''' Row: 1
Excel.WriteToExcel.WriteCell Instance: ExcelInstance Value: dataExtracted Column: $'''A''' Row: 2
END
IF pagesNotExtracted.Count > 0 THEN
Text.JoinText.JoinWithCustomDelimiter List: pagesNotExtracted CustomDelimiter: $''', ''' Result=> pagesNotExtracted_CommaSeparated
Display.ShowMessageDialog.ShowMessage Title: $'''Extraction Issues''' Message: $'''No data extracted from these page numbers: %pagesNotExtracted_CommaSeparated%
Check the browser window to see if data exists or error messages were returned from the API.''' Icon: Display.Icon.ErrorIcon Buttons: Display.Buttons.OK DefaultButton: Display.DefaultButton.Button1 IsTopMost: True
END
# [ControlRepository][PowerAutomateDesktop]
{
"ApplicationInfo": {
"Name": "ClipboardControlRepository",
"Version": "1.0"
},
"Screens": [
{
"Controls": [
{
"AutomationProtocol": "uia3",
"ScreenShot": null,
"ElementTypeName": "<p>",
"Name": "pInfo",
"SelectorCount": 1,
"Selectors": [
{
"CustomSelector": " > body > p:eq(1)",
"Elements": [
{
"Attributes": [
{
"Ignore": true,
"IsOrdinal": false,
"Name": "Class",
"Operation": "EqualTo",
"Value": null
},
{
"Ignore": true,
"IsOrdinal": false,
"Name": "Id",
"Operation": "EqualTo",
"Value": null
},
{
"Ignore": true,
"IsOrdinal": true,
"Name": "Ordinal",
"Operation": "EqualTo",
"Value": "-1"
},
{
"Ignore": true,
"IsOrdinal": false,
"Name": "Title",
"Operation": "EqualTo",
"Value": null
}
],
"CustomValue": null,
"Ignore": false,
"Name": "<body>",
"Tag": "body"
},
{
"Attributes": [
{
"Ignore": true,
"IsOrdinal": false,
"Name": "Class",
"Operation": "EqualTo",
"Value": null
},
{
"Ignore": true,
"IsOrdinal": false,
"Name": "Id",
"Operation": "EqualTo",
"Value": null
},
{
"Ignore": false,
"IsOrdinal": true,
"Name": "Ordinal",
"Operation": "EqualTo",
"Value": "1"
},
{
"Ignore": true,
"IsOrdinal": false,
"Name": "Title",
"Operation": "EqualTo",
"Value": null
}
],
"CustomValue": null,
"Ignore": false,
"Name": "<p>",
"Tag": "p"
}
],
"Ignore": false,
"IsCustom": true,
"IsWindowsInstance": false,
"Order": 0
}
],
"Tag": "p"
},
{
"AutomationProtocol": null,
"ScreenShot": null,
"ElementTypeName": "a",
"Name": "nextPage",
"SelectorCount": 1,
"Selectors": [
{
"CustomSelector": "a[Id=\"button_next\"]",
"Elements": [],
"Ignore": false,
"IsCustom": true,
"IsWindowsInstance": false,
"Order": 0
}
],
"Tag": "a"
}
],
"ScreenShot": null,
"ElementTypeName": "Web Page",
"Name": "WebPage",
"SelectorCount": 1,
"Selectors": [
{
"CustomSelector": ":desktop > domcontainer",
"Elements": [
{
"Attributes": [],
"CustomValue": "domcontainer",
"Ignore": false,
"Name": "Web Page",
"Tag": "domcontainer"
}
],
"Ignore": false,
"IsCustom": false,
"IsWindowsInstance": false,
"Order": 0
}
],
"Tag": "domcontainer"
}
],
"Version": 1
}