Sabin Sapkota

Sabin Sapkota

  • NA
  • 3
  • 3.6k

Scrape HTML and sort the data in Excel (C#)

Feb 11 2015 2:40 PM
Hello, I am trying to scrape a website (it's public data) and store the results in Excel or some other database. I am new to all this. I was able to download the HTML source to a text or Excel file, but the data is very unorganized. Basically, I want to organize the data into a readable format. These are the things I am trying to do:
1) Get the data from the website inside <div id="container">, which contains links. I want to follow all of those links and fetch data from each of them.
2) The collected data should be readable and formatted.
I could get the contents of the first page but could not follow the links. Could you please suggest how I should proceed? I looked it up and found that HtmlAgilityPack is one way to do it, but I have never used it before and I am stuck. I have included the code that I have written so far.
Thank you
This is my form class:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Net;
using System.IO;



namespace tryScrape1
{
    /// <summary>
    /// Form that downloads the page at the URL typed into <c>textBox1</c>,
    /// cuts out the text between two hard-coded markers and writes that
    /// fragment to a file on disk.
    /// </summary>
    public partial class Form1 : Form
    {
        // Text that marks where the fragment to keep begins (inclusive).
        private const string StartMarker = "paddingbig";

        // Text that marks where the fragment to keep ends (exclusive).
        private const string EndMarker = "321,820";

        // Destination of the scraped fragment.
        private const string OutputDirectory = @"c:\temp\";
        private const string OutputFileName = "scrapped_data.xls";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler: scrapes the URL entered in the text box and saves the
        /// fragment between <see cref="StartMarker"/> and <see cref="EndMarker"/>.
        /// Every failure is reported to the user in a message box instead of
        /// escaping the handler.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            string url = textBox1.Text;

            // Validate up front so the "blank" message is only shown when it is true.
            if (string.IsNullOrWhiteSpace(url))
            {
                MessageBox.Show("URL input cannot be blank.");
                return;
            }

            try
            {
                string sourceCode = GetSource.getSourceCode(url);

                string fragment = ExtractFragment(sourceCode, StartMarker, EndMarker);
                if (fragment == null)
                {
                    MessageBox.Show("The expected start/end markers were not found in the page.");
                    return;
                }

                // No-op when the directory already exists; avoids a failure on a fresh machine.
                Directory.CreateDirectory(OutputDirectory);
                string path = Path.Combine(OutputDirectory, OutputFileName);

                // 'using' guarantees the file is flushed and closed even if Write throws.
                using (StreamWriter writer = new StreamWriter(path))
                {
                    writer.Write(fragment);
                }

                MessageBox.Show("Contents have been Scrapped!");
                textBox1.Clear();
            }
            catch (Exception ex)
            {
                // UI boundary: surface the real cause (bad URL, network error,
                // file access problem) rather than a misleading fixed message.
                MessageBox.Show("The URL could not be scraped: " + ex.Message);
            }
        }

        /// <summary>
        /// Returns the part of <paramref name="source"/> that starts at the first
        /// occurrence of <paramref name="startMarker"/> and stops just before the
        /// next occurrence of <paramref name="endMarker"/>.
        /// </summary>
        /// <param name="source">Text to search; may be null or empty.</param>
        /// <param name="startMarker">Marker that begins the fragment (included in the result).</param>
        /// <param name="endMarker">Marker that ends the fragment (excluded from the result).</param>
        /// <returns>The fragment, or <c>null</c> when either marker cannot be found.</returns>
        private static string ExtractFragment(string source, string startMarker, string endMarker)
        {
            if (string.IsNullOrEmpty(source))
            {
                return null;
            }

            int startIndex = source.IndexOf(startMarker, StringComparison.Ordinal);
            if (startIndex < 0)
            {
                return null;
            }

            // Search from startIndex so the computed length can never be negative.
            int endIndex = source.IndexOf(endMarker, startIndex, StringComparison.Ordinal);
            if (endIndex < 0)
            {
                return null;
            }

            return source.Substring(startIndex, endIndex - startIndex);
        }
    }
}

GetSource Class:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.IO;

namespace tryScrape1
{
    /// <summary>
    /// Helper for downloading the raw source of a web page.
    /// </summary>
    class GetSource
    {
        /// <summary>
        /// Issues an HTTP request to <paramref name="url"/> and returns the whole
        /// response body as a string.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The response body text read to the end of the stream.</returns>
        /// <remarks>
        /// Exceptions raised while creating the request, fetching the response or
        /// reading the stream are not caught here; they propagate to the caller.
        /// </remarks>
        public static string getSourceCode(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            // 'using' releases the connection and the reader even when reading
            // fails, which the manual Close() calls did not.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader streamReader = new StreamReader(responseStream))
            {
                return streamReader.ReadToEnd();
            }
        }
    }
}


Answers (2)