We will use BeautifulSoup to scrape data from http://boxofficemojo.com/ and http://allocine.fr/

In [1]:
import urllib2
from bs4 import BeautifulSoup

Let's define first some basic functions to convert scraped data into exploitable format

In [5]:
import dateutil.parser
import urllib

def to_date(datestring):
    """Parse a free-form date string (e.g. '6/25/2014') into a datetime."""
    return dateutil.parser.parse(datestring)

def money_to_int(moneystring):
    """Convert a Mojo money string such as '$1,234,567' to an int.

    Returns None when the value is missing or not numeric (e.g. 'n/a',
    '-', or a non-string input).
    """
    try:
        return int(moneystring.replace('$', '').replace(',', ''))
    except (AttributeError, ValueError):
        # AttributeError: input is not a string (e.g. None).
        # ValueError: string is not a number after stripping '$' and ','.
        return None
    
def clear_url(raw_url):
    """Normalize a movie title for use in a search-engine query URL.

    Strips parentheses, dots and commas, turns hyphens into spaces,
    folds common accented characters to plain ASCII, and encodes the
    result as latin-1, dropping characters it cannot represent.
    """
    tmp_url = raw_url.replace('(', '').replace(')', '').replace('-', ' ').replace('.', '')
    # BUG FIX: the original did .replace(',', 'e') — almost certainly a
    # typo for the accent fold u'è' -> 'e' — which turned every comma in
    # a title into the letter 'e' and garbled the search query. Commas
    # are now removed like the other punctuation.
    tmp_url = tmp_url.replace(u'é', 'e').replace(u'è', 'e').replace(u'â', 'a')
    return tmp_url.replace(',', '').encode('latin1', 'ignore')

Loading, from the international section of Mojo, the tables with the top-ranking movies by revenue in France, for the years 2010 to 2014. For each year we scan up to four pages with 100 movies per page. Rows contain the movie rank for a specific year, the movie name (without the Mojo URL), the French distributor, the French gross and the French release date (to which we will append the year).

In [4]:
# Scrape Box Office Mojo's France yearly charts for 2010-2014.
# Each collected row: [rank, title, distributor, gross, release date].
data = []
url = "http://boxofficemojo.com/intl/france/yearly/?yr="

for scan in ['2014','2013','2012','2011','2010']:
    # Up to four pages per year, 100 movies per page.
    for scan_page in [str(x+1) for x in range(4)]:
        page = urllib2.urlopen(url+scan+'&pagenum='+scan_page)
        soup = BeautifulSoup(page)
        try:
            table = soup.find_all('table')
            # The ranking table is the 5th <table> on the page — this is
            # layout-dependent and breaks if Mojo changes its markup.
            data_table = table[4]
            rows = data_table.find_all('tr')
            for row in rows[1:]:  # skip the header row
                cols = row.find_all('td')
                cols = [ele.text.strip() for ele in cols]
                # Mojo shows only month/day; append the year being scanned.
                cols[4]=cols[4]+'/'+scan
                data.append(cols)
        except IndexError:
            # Fewer tables than expected (e.g. a year has no 4th page).
            pass

To get more info from Mojo for each movie we will have to visit its specific Mojo page, which we will find with the Mojo search engine. From the search results, we will pick the movie that has a US release date within 180 days of the French release date. The search engine gives information such as the studio, US box office, US number of theaters, as well as the revenue and number of theaters of the US opening weekend; and once on the movie page, we can get information such as the director of the movie.

In [10]:
# Match each scraped French chart entry to its Mojo movie page via the
# search engine, keeping the first search result whose US release date
# is within 180 days of the French one. Note: relies on url_to_director,
# defined in a later cell (fine in a notebook, where cells can be run
# out of order).
data_with_url_and_release_dates = [['FrenchRank',
                                    'MovieName',
                                    'MojoUrl',
                                    'FrenchDistributor',
                                    'FrenchGross',
                                    'FrenchReleaseDate',
                                    'Studio',
                                    'USGross',
                                    'USNumberOfTheaters',
                                    'USOpening',
                                    'USOpeningTheaters',
                                    'USReleaseDate',
                                    'Director']]
for row in data:
    url = "http://boxofficemojo.com/search/?q="
    query = urllib.quote(clear_url(row[1]))
    try:
        page = urllib2.urlopen(url+query)
        soup = BeautifulSoup(page)
        release_date_given = to_date(row[4])

        try:
            table = soup.find_all('table')
            # The results table is the 5th <table> on the search page.
            link_table = table[4]
            rows = link_table.find_all('tr')
            for x in rows[1:]:  # skip the header row
                cols = x.find_all('td')
                col_with_link = cols[0]
                col_with_release_date = cols[6]
                try:
                    release_date_searched = to_date(col_with_release_date.text)
                except ValueError:
                    # BUG FIX: this was 'pass', which fell through and
                    # compared a stale date from a previous result row
                    # (or hit an unbound name on the first row).
                    continue
                delta = release_date_searched - release_date_given
                if abs(delta.days) < 180:
                    matching_url = 'http://boxofficemojo.com'+col_with_link.find('a')['href']
                    director = url_to_director(matching_url)
                    row_data = [int(row[0]), # french rank
                                row[1], # name of the movie
                                matching_url, # url
                                row[2], # distributor
                                money_to_int(row[3]), # french gross
                                release_date_given, # french release date
                                cols[1].text, # studio
                                money_to_int(cols[2].text), # us gross
                                money_to_int(cols[3].text), # us number of theaters
                                money_to_int(cols[4].text), # us opening
                                money_to_int(cols[5].text), # number of us opening theaters
                                release_date_searched, # us release date
                                director] # us director
                    data_with_url_and_release_dates.append(row_data)
                    break  # first match within the window wins
        except Exception:
            # Malformed search page: skip this movie (best effort).
            pass
    except Exception:
        # Network error or unparsable French date: skip this movie.
        pass

Save the data to a CSV file

In [40]:
import csv

# Dump the scraped Mojo data to disk; unicode cells are UTF-8 encoded
# first, since Python 2's csv module only handles byte strings.
with open('nasdag_luther_data.csv', 'w') as csvfile:
    csv.writer(csvfile).writerows(
        [cell.encode('utf8') if isinstance(cell, unicode) else cell
         for cell in record]
        for record in data_with_url_and_release_dates)

Before scraping data from Allociné we need to define some functions.

The first one will use the search engine of Allociné to find the corresponding movie page of a movie name with a release date and a director.

In [7]:
def allocine(movie_name, release_date_given, director):
    """Search Allocine for a movie and return its movie-page URL.

    A search result matches when its first detail token equals
    release_date_given (a year, as a string) and the director's name
    appears in the result's detail text. Returns None when nothing
    matches or the results page cannot be parsed.
    """
    url = "http://www.allocine.fr/recherche/?q="
    query = clear_url(movie_name.replace(' ', '+'))
    soup = BeautifulSoup(urllib2.urlopen(url + query))

    try:
        result_rows = soup.find_all('table')[0].find_all('tr')
        for result in result_rows:
            try:
                detail_cell = result.find_all('td')[1]
                details = detail_cell.find(class_='fs11').text
                # First token of the detail line is the release year.
                if details.split()[0] == release_date_given and director in details:
                    return 'http://www.allocine.fr' + detail_cell.find('a')['href']
            except:
                # Row without the expected cells — ignore it (best effort).
                pass
    except:
        # No results table at all on the page.
        pass
    return None

This function will locate the rating and the number of entries in a given Allociné movie page

In [8]:
def french_press_rating_with_entries(fr_movie_link):
    """Return (press rating, admissions count) from an Allocine movie page.

    The rating uses a comma decimal separator on the French page, and the
    admissions figure is space-grouped with a trailing unit word; both are
    normalized before conversion. Returns (None, None) when either value
    cannot be located or parsed.
    """
    soup = BeautifulSoup(urllib2.urlopen(fr_movie_link))
    try:
        rating_text = soup.find(class_='note').text.replace(',', '.')
        entry_cells = soup.find_all(class_="visible")[1].find_all('td')
        # Drop the trailing unit word, then join the space-grouped digits.
        entries_text = "".join(entry_cells[2].text.split()[:-1])
        return float(rating_text), int(entries_text)
    except:
        pass
    return None, None

This function will locate the Director name in a given Mojo movie page (used during data scraping from Mojo)

In [10]:
def url_to_director(url):
    """Extract the director's name from a Box Office Mojo movie page.

    Mojo lays credits out as label/value <td> pairs: the value cell
    immediately follows the cell labelled 'Director:'/'Directors:'
    (falling back to 'Producers:'). Returns '' when no pair is found.
    """
    soup = BeautifulSoup(urllib2.urlopen(url))
    cells = soup.find_all('td')

    labels = ('Director:', 'Directors:', 'Producers:')
    for label_cell, value_cell in zip(cells, cells[1:]):
        if any(label_cell.find(text=label) for label in labels):
            return value_cell.find_all('a')[0].text
    return ''

Let's now load the data from the previous data-scraping phase

In [11]:
import csv

# Reload the Mojo dataset saved earlier ('rb' is the Python 2 convention
# for reading with the csv module).
# BUG FIX: the file handle was opened inline and never closed; the
# context manager guarantees it is released.
with open('nasdag_luther_data.csv', 'rb') as csvfile:
    data_with_url_and_release_dates = list(csv.reader(csvfile))

We can run the data scraping script on Allociné and append the French rating and number of entries to each movie

In []:
from dateutil import parser

# Augment each Mojo row with the Allocine press rating and admissions.
data_with_allocine = []

# Extend the header row with the two new columns.
row0 = data_with_url_and_release_dates[0]
row0.append('FrRating')
row0.append('FrEntries')

data_with_allocine.append(row0)

for row in data_with_url_and_release_dates[1:]:
    try:
        # Match on title, French release year and director.
        french_url = allocine(row[1], str(parser.parse(row[5]).year), row[12])
        if french_url is not None:
            fr_rating, fr_entries = french_press_rating_with_entries(french_url)
        else:
            # BUG FIX: the original passed None to urlopen, which raised
            # and the bare except silently dropped the whole movie; keep
            # it with empty rating columns instead.
            fr_rating, fr_entries = None, None
        row.append(fr_rating)
        row.append(fr_entries)
        data_with_allocine.append(row)
    except Exception:
        # Network or parse failure: skip this movie (best effort).
        pass

And finally save the complete data to a CSV file

In [13]:
# BUG FIX: write the merged dataset (data_with_allocine), as the prose
# above announces; the original wrote data_with_url_and_release_dates,
# which also contains rows that failed the Allocine lookup and therefore
# lack the two rating columns, yielding ragged CSV rows.
with open('nasdag_luther_data_with_french_rating.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    for row in data_with_allocine:
        # UTF-8-encode unicode cells for Python 2's csv writer.
        row = [c.encode('utf8') if isinstance(c, unicode) else c for c in row]
        writer.writerow(row)