Skip to content Skip to sidebar Skip to footer

What Is An Acceptable Way To Discard The First Couple Lines Of Text?

Using the requests library, I am scraping lines of text off the SEC.gov website for a personal project. I get an error because I am trying to assign to variables before the correct…

Solution 1:

You expect a total of 5 columns. Ignore every line that does not have 5 columns.

import requests

def get_index(year, quarter):
    """Yield the pipe-delimited rows of an EDGAR quarterly master index.

    Downloads ``master.idx`` for the given year and quarter and yields every
    line that splits into exactly five ``|``-separated fields, as a list of
    strings. Lines with any other field count are skipped. The column-header
    line is expected to be the first row yielded; the caller skips it.

    Args:
        year: Year component of the index URL, e.g. ``2018``.
        quarter: Quarter component of the index URL, e.g. ``'QTR1'``.
    """
    url = 'https://www.sec.gov/Archives/edgar/full-index/%s/%s/master.idx' % (year, quarter)
    r = requests.get(url)

    for line in r.text.splitlines():
        row = line.split('|')
        # Keep only lines with the expected five columns.
        if len(row) == 5:
            yield row


rows = get_index(2018, 'QTR1')
next(rows)  # skip header row

# Print the first 12 data rows (i runs 0..11 before the break).
for i, row in enumerate(rows):
    print(row)
    if i > 10:
        break

Solution 2:

import re
import requests
import pandas as pd

def get_data(url):
    """Download a pipe-delimited index file and return it as a DataFrame.

    The column names are taken from the first line of the response that
    consists only of word characters, whitespace and ``|``. The line right
    after that header (the dashed separator) is skipped, and every remaining
    line is split on ``|`` into one row.

    Args:
        url: Address of the index file to download.

    Returns:
        A ``pandas.DataFrame`` of strings with one column per header field.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If no header line can be found in the response text.
    """
    r = requests.get(url)
    r.raise_for_status()

    # Find the csv header
    m1 = re.search(r"\n(\w\s*\|?)+\n", r.text)
    if m1 is None:
        raise ValueError("no column header line found in %s" % url)

    # The match ends just past the header's newline, i.e. at the start of the
    # dash line; find where that dash line ends.
    dash_end = r.text.find("\n", m1.end())

    # Everything after the dash line is data. If the text stops at the dash
    # line there are no data rows (avoid re-parsing the text from index 0).
    lines = [] if dash_end == -1 else r.text[dash_end + 1:].splitlines()

    # If you have Pandas, you can pack everything into a nice DataFrame
    cols = m1.group().strip().split('|')
    df = pd.DataFrame([line.split('|') for line in lines], columns=cols)
    return df

url = 'https://www.sec.gov/Archives/edgar/full-index/2019/QTR1/master.idx'
df = get_data(url)
# Print explicitly: a bare `df.head()` expression displays nothing when this
# is run as a script rather than typed into an interactive session.
print(df.head())

gives

       CIK            Company Name Form Type  Date Filed                                     Filename
0  1000045  NICHOLAS FINANCIAL INC      10-Q  2019-02-14  edgar/data/1000045/0001193125-19-039489.txt
1  1000045  NICHOLAS FINANCIAL INC         4  2019-01-15  edgar/data/1000045/0001357521-19-000001.txt
2  1000045  NICHOLAS FINANCIAL INC         4  2019-02-19  edgar/data/1000045/0001357521-19-000002.txt
3  1000045  NICHOLAS FINANCIAL INC         4  2019-03-15  edgar/data/1000045/0001357521-19-000003.txt
4  1000045  NICHOLAS FINANCIAL INC       8-K  2019-02-01  edgar/data/1000045/0001193125-19-024617.txt

Solution 3:

You could look for the line consisting only of "-" characters, then take the rows after it.

import requests
import pandas as pd
# Download the index, skip everything up to and including the dashed
# separator line, and load the remaining rows into a DataFrame.
url = 'https://www.sec.gov/Archives/edgar/full-index/2018/QTR1/master.idx'
r = requests.get(url).text
records = r.splitlines()
results = []
header = 'CIK|Company Name|Form Type|Date Filed|Filename'
found = False

for row in records:
    if found:
        # Past the separator: every line is a data row.
        results.append(row.split('|'))
    elif set(row.strip()) == {'-'}:
        # A line made up solely of '-' marks the end of the preamble.
        found = True

df = pd.DataFrame(results, columns=header.split('|'))
print(df.head())

Post a Comment for "What Is An Acceptable Way To Discard The First Couple Lines Of Text?"