{
  "name": "ScrapingBee and Google Sheets integration template",
  "nodes": [
    {
      "id": "cc29176e-71a0-435d-b880-4c972642b6dd",
      "name": "Load the xml file as JSON",
      "type": "n8n-nodes-base.extractFromFile",
      "position": [
        1792,
        48
      ]
    },
    {
      "id": "2671b101-8196-4c2a-881b-2326af6f05d7",
      "name": "If it's a binary file",
      "type": "n8n-nodes-base.if",
      "position": [
        896,
        144
      ]
    },
    {
      "id": "f0a656d3-31e3-4a83-a5b9-ea7e6647f9eb",
      "name": "Domain to scrape",
      "type": "n8n-nodes-base.webhook",
      "position": [
        0,
        264
      ]
    },
    {
      "id": "66346ed3-4ce8-4a50-a379-61280b7dcd9d",
      "name": "Scrape robots.txt file",
      "type": "n8n-nodes-scrapingbee.ScrapingBee",
      "position": [
        224,
        264
      ]
    },
    {
      "id": "2fe3fd6d-1105-49e2-8c13-aae809b9a745",
      "name": "Scrape sitemap.xml file",
      "type": "n8n-nodes-scrapingbee.ScrapingBee",
      "position": [
        672,
        144
      ]
    },
    {
      "id": "51f47d4e-f7f3-4578-8516-8a8bb03f2123",
      "name": "If sitemap links are found",
      "type": "n8n-nodes-base.if",
      "position": [
        448,
        480
      ]
    },
    {
      "id": "bbd9751f-3f8f-4873-bd5b-54d519b1738a",
      "name": "If it's a .gz file",
      "type": "n8n-nodes-base.if",
      "position": [
        1120,
        48
      ]
    },
    {
      "id": "bfa892a6-7215-4503-99ae-6da5ed012029",
      "name": "Decompress .gz file",
      "type": "n8n-nodes-base.compression",
      "position": [
        1344,
        144
      ]
    },
    {
      "id": "85e6ae16-6b30-4241-a762-5fdc3381b6f8",
      "name": "Store the file to data key for easy handling",
      "type": "n8n-nodes-base.code",
      "position": [
        1568,
        144
      ]
    },
    {
      "id": "fc8d55c1-ad13-4384-bd7a-06c05c83bb42",
      "name": "Extract non-xml links",
      "type": "n8n-nodes-base.code",
      "position": [
        2016,
        -96
      ]
    },
    {
      "id": "813a432d-1ad5-4e52-9319-9f76352f03d4",
      "name": "Extract xml links",
      "type": "n8n-nodes-base.code",
      "position": [
        2016,
        240
      ]
    },
    {
      "id": "d0e312fd-d25e-4af9-998e-819e80d6689a",
      "name": "Scrape xml file",
      "type": "n8n-nodes-scrapingbee.ScrapingBee",
      "position": [
        2240,
        336
      ]
    },
    {
      "id": "e2a98270-8996-4321-94ff-9b9dfef8cb7b",
      "name": "Append links to sheet",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        2240,
        -96
      ]
    },
    {
      "id": "a8d2c3f8-a711-43bf-96d5-6bbbb01ac9c5",
      "name": "Sticky Note",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -336,
        208
      ],
      "parameters": {
        "width": 272,
        "height": 208,
        "content": "## Input\n\nYou need to send a webhook request with the domain as a query parameter.\n\nFor example:\n`https://<webhook_link>?domain=n8n.io`"
      }
    },
    {
      "id": "5851dd9e-24a6-4842-9699-cbbd322d6f2f",
      "name": "Sticky Note1",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        144,
        432
      ],
      "parameters": {
        "width": null,
        "height": 144,
        "content": "## Scrape Robots.txt\n\nMost websites list their sitemap links in the robots.txt file, so we scrape it first."
      }
    },
    {
      "id": "09c2c465-ff2a-44a4-9536-8dc59290c07d",
      "name": "Sticky Note2",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        608,
        -80
      ],
      "parameters": {
        "width": null,
        "height": 176,
        "content": "## Scrape Sitemap.xml\n\nIf sitemap links are missing from the robots.txt file, we try to scrape the sitemap.xml file instead."
      }
    },
    {
      "id": "8939e3bc-458c-414b-94c1-032c229ce369",
      "name": "Sticky Note3",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        384,
        640
      ],
      "parameters": {
        "width": null,
        "height": 80,
        "content": "If sitemap links are available, we will directly extract the xml links"
      }
    },
    {
      "id": "82e794a5-3e7d-4d33-951f-06930423b647",
      "name": "Sticky Note4",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        880,
        288
      ],
      "parameters": {
        "width": null,
        "height": 96,
        "content": "Sometimes links are received as text content and sometimes they are received as binary, so we need to check for that."
      }
    },
    {
      "id": "a16d3039-8c33-4a2f-908c-1d0c8b2504d1",
      "name": "Sticky Note5",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        1408,
        -192
      ],
      "parameters": {
        "width": null,
        "height": 176,
        "content": "If it's a .xml.gz file, we need to decompress it. We are also renaming the key because by default they are named `file_0` and we need it to be named as `data` so that we can use a single extraction lo"
      }
    },
    {
      "id": "6917c1a1-5b0a-4e68-a9d2-f3f7e4bf0de2",
      "name": "Sticky Note6",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        2192,
        64
      ],
      "parameters": {
        "width": null,
        "height": 256,
        "content": "## Add Links to Sheet and Scrape XML Links\n\nIf the xml file contains normal links they are extracted and added to sheet. And if it contains other `.xml` links, we will scrape them. Basically, this is "
      }
    },
    {
      "id": "c5c6585a-efe5-44ef-9a78-43651308e5ce",
      "name": "Sticky Note7",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        2192,
        -208
      ],
      "parameters": {
        "width": null,
        "height": 80,
        "content": "Connect to a Google Sheet and add `links` as a column name"
      }
    },
    {
      "id": "ffba7362-954c-4274-ac21-4ecf67e9a536",
      "name": "Sticky Note8",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -304,
        -80
      ],
      "parameters": {
        "width": 800,
        "height": 112,
        "content": "## NOTE\nSome heavy sitemaps could result in a crash if the workflow consumes more memory than what is available in your n8n plan or self-hosted system. If this happens, we would recommend you to eithe"
      }
    }
  ],
  "connections": {
    "Scrape xml file": {
      "main": [
        [
          {
            "node": "If it's a binary file",
            "type": "main",
            "index": 0
          }
        ],
        []
      ]
    },
    "Domain to scrape": {
      "main": [
        [
          {
            "node": "Scrape robots.txt file",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract xml links": {
      "main": [
        [
          {
            "node": "Scrape xml file",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "If it's a .gz file": {
      "main": [
        [
          {
            "node": "Decompress .gz file",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Load the xml file as JSON",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Decompress .gz file": {
      "main": [
        [
          {
            "node": "Store the file to data key for easy handling",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Append links to sheet": {
      "main": [
        []
      ]
    },
    "Extract non-xml links": {
      "main": [
        [
          {
            "node": "Append links to sheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "If it's a binary file": {
      "main": [
        [
          {
            "node": "If it's a .gz file",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Extract non-xml links",
            "type": "main",
            "index": 0
          },
          {
            "node": "Extract xml links",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Scrape robots.txt file": {
      "main": [
        [
          {
            "node": "If sitemap links are found",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Scrape sitemap.xml file",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Scrape sitemap.xml file": {
      "main": [
        [
          {
            "node": "If it's a binary file",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Load the xml file as JSON": {
      "main": [
        [
          {
            "node": "Extract non-xml links",
            "type": "main",
            "index": 0
          },
          {
            "node": "Extract xml links",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "If sitemap links are found": {
      "main": [
        [
          {
            "node": "Extract xml links",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Scrape sitemap.xml file",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Store the file to data key for easy handling": {
      "main": [
        [
          {
            "node": "Load the xml file as JSON",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}