updates for python3 #11

Open
wants to merge 1 commit into base: master
51 changes: 26 additions & 25 deletions data_fetcher.py
@@ -7,9 +7,9 @@
 import pandas as pd
 import random
 import time
-import urllib2
+import requests
+from requests.utils import requote_uri
 
-from BeautifulSoup import BeautifulSoup
 from datetime import datetime
 
 DATA_DIR = "data"
@@ -25,19 +25,19 @@ def _download_sp500_list():
     if os.path.exists(SP500_LIST_PATH):
         return
 
-    f = urllib2.urlopen(SP500_LIST_URL)
-    print "Downloading ...", SP500_LIST_URL
-    with open(SP500_LIST_PATH, 'w') as fin:
-        print >> fin, f.read()
+    r = requests.get(SP500_LIST_URL)
+    print("Downloading ...", SP500_LIST_URL)
+    with open(SP500_LIST_PATH, 'wb') as fin:
+        fin.write(r.content)
     return
 
 
 def _load_symbols():
     _download_sp500_list()
     df_sp500 = pd.read_csv(SP500_LIST_PATH)
-    df_sp500.sort('Market Cap', ascending=False, inplace=True)
+    df_sp500.sort_values('Market Cap', ascending=False, inplace=True)
     stock_symbols = df_sp500['Symbol'].unique().tolist()
-    print "Loaded %d stock symbols" % len(stock_symbols)
+    print("Loaded %d stock symbols" % len(stock_symbols))
     return stock_symbols
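The DataFrame.sort change is as necessary as the print fixes: sort was deprecated in pandas 0.17 and removed in 0.20, so the old call raises AttributeError on any modern pandas. A quick sketch of the replacement (column values are made up):

    import pandas as pd

    df = pd.DataFrame({'Symbol': ['AAPL', 'XOM'], 'Market Cap': [800e9, 350e9]})
    # Old (pandas < 0.20): df.sort('Market Cap', ascending=False, inplace=True)
    df.sort_values('Market Cap', ascending=False, inplace=True)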


@@ -53,33 +53,34 @@ def fetch_prices(symbol, out_name):
     # Format today's date to match Google's finance history api.
     now_datetime = datetime.now().strftime("%b+%d,+%Y")
 
-    BASE_URL = "https://www.google.com/finance/historical?output=csv&q={0}&startdate=Jan+1%2C+1980&enddate={1}"
-    symbol_url = BASE_URL.format(
-        urllib2.quote(symbol),
-        urllib2.quote(now_datetime, '+')
-    )
-    print "Fetching {} ...".format(symbol)
-    print symbol_url
+    BASE_URL = "https://finance.google.com/finance/historical?output=csv&q={0}&startdate=Jan+1%2C+1980&enddate={1}"
+    symbol_url = requote_uri(BASE_URL.format(
+        symbol,
+        now_datetime
+    ))
+    print("Fetching {} ...".format(symbol))
+    print(symbol_url)
 
     try:
-        f = urllib2.urlopen(symbol_url)
-        with open(out_name, 'w') as fin:
-            print >> fin, f.read()
-    except urllib2.HTTPError:
-        print "Failed when fetching {}".format(symbol)
+        r = requests.get(symbol_url)
+        r.raise_for_status()
+        with open(out_name, 'wb') as fin:
+            fin.write(r.content)
+    except requests.exceptions.HTTPError as err:
+        print("Failed {} when fetching {}".format(err, symbol))
         return False
 
     data = pd.read_csv(out_name)
     if data.empty:
-        print "Remove {} because the data set is empty.".format(out_name)
+        print("Remove {} because the data set is empty.".format(out_name))
         os.remove(out_name)
     else:
         dates = data.iloc[:,0].tolist()
-        print "# Fetched rows: %d [%s to %s]" % (data.shape[0], dates[-1], dates[0])
+        print("# Fetched rows: %d [%s to %s]" % (data.shape[0], dates[-1], dates[0]))
 
     # Take a rest
     sleep_time = random.randint(*RANDOM_SLEEP_TIMES)
-    print "Sleeping ... %ds" % sleep_time
+    print("Sleeping ... %ds" % sleep_time)
     time.sleep(sleep_time)
     return True
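One behavioral difference hiding in this hunk: urllib2.urlopen raised HTTPError automatically on 4xx/5xx responses, while requests.get does not, so the added r.raise_for_status() call is what keeps the except branch reachable. A small sketch of the new failure path (URL illustrative):

    import requests

    try:
        r = requests.get("https://example.com/missing.csv")
        r.raise_for_status()   # turns a 404/500 status into an exception
    except requests.exceptions.HTTPError as err:
        print("Failed {} when fetching".format(err))

One caveat: network-level failures raise requests.exceptions.ConnectionError, which this except clause does not catch; catching requests.exceptions.RequestException would cover both.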

@@ -97,14 +98,14 @@ def main(continued):
     for idx, sym in enumerate(symbols):
         out_name = os.path.join(DATA_DIR, sym + ".csv")
         if continued and os.path.exists(out_name):
-            print "Fetched", sym
+            print("Fetched", sym)
             continue
 
         succeeded = fetch_prices(sym, out_name)
         num_failure += int(not succeeded)
 
         if idx % 10 == 0:
-            print "# Failures so far [%d/%d]: %d" % (idx + 1, len(symbols), num_failure)
+            print("# Failures so far [%d/%d]: %d" % (idx + 1, len(symbols), num_failure))
 
 
 if __name__ == "__main__":
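A compatibility note on the print conversions throughout this file: without from __future__ import print_function, a two-argument call like print("Fetched", sym) still parses under Python 2 but prints a tuple, so this change effectively makes the script Python-3-only. If dual support mattered, the fix would be the future import (hypothetical here, since the PR targets Python 3):

    from __future__ import print_function  # hypothetical, for Python 2 compatibility

    print("Fetched", "AAPL")  # Python 3 (and 2 with the import): Fetched AAPL
    # plain Python 2 would print the tuple: ('Fetched', 'AAPL')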
6 changes: 3 additions & 3 deletions main.py
@@ -52,15 +52,15 @@ def load_sp500(input_size, num_steps, k=None, target_symbol=None, test_ratio=0.0
     info = pd.read_csv("data/constituents-financials.csv")
     info = info.rename(columns={col: col.lower().replace(' ', '_') for col in info.columns})
     info['file_exists'] = info['symbol'].map(lambda x: os.path.exists("data/{}.csv".format(x)))
-    print info['file_exists'].value_counts().to_dict()
+    print(info['file_exists'].value_counts().to_dict())
 
     info = info[info['file_exists'] == True].reset_index(drop=True)
-    info = info.sort('market_cap', ascending=False).reset_index(drop=True)
+    info = info.sort_values('market_cap', ascending=False).reset_index(drop=True)
 
     if k is not None:
         info = info.head(k)
 
-    print "Head of S&P 500 info:\n", info.head()
+    print("Head of S&P 500 info:\n", info.head())
 
     # Generate embedding meta file
     info[['symbol', 'sector']].to_csv(os.path.join("logs/metadata.tsv"), sep='\t', index=False)
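A side nit, not part of the Python 3 migration: the boolean mask above does not need the == True comparison. A minimal equivalent (DataFrame contents made up):

    import pandas as pd

    info = pd.DataFrame({'symbol': ['AAPL', 'FAKE'], 'file_exists': [True, False]})
    # Same result as info[info['file_exists'] == True]:
    info = info[info['file_exists']].reset_index(drop=True)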
16 changes: 8 additions & 8 deletions model_rnn.py
@@ -171,9 +171,9 @@ def train(self, dataset_list, config):
         merged_test_y = np.array(merged_test_y)
         merged_test_labels = np.array(merged_test_labels)
 
-        print "len(merged_test_X) =", len(merged_test_X)
-        print "len(merged_test_y) =", len(merged_test_y)
-        print "len(merged_test_labels) =", len(merged_test_labels)
+        print("len(merged_test_X) =", len(merged_test_X))
+        print("len(merged_test_y) =", len(merged_test_y))
+        print("len(merged_test_labels) =", len(merged_test_labels))
 
         test_data_feed = {
             self.learning_rate: 0.0,
@@ -196,10 +196,10 @@
                 i for i, sym_label in enumerate(merged_test_labels)
                 if sym_label[0] == l])
             sample_indices[sym] = target_indices
-        print sample_indices
+        print(sample_indices)
 
-        print "Start training for stocks:", [d.stock_sym for d in dataset_list]
-        for epoch in xrange(config.max_epoch):
+        print("Start training for stocks:", [d.stock_sym for d in dataset_list])
+        for epoch in range(config.max_epoch):
             epoch_step = 0
             learning_rate = config.init_learning_rate * (
                 config.learning_rate_decay ** max(float(epoch + 1 - config.init_epoch), 0.0)
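The xrange rename is a drop-in fix: xrange existed only in Python 2, and Python 3's range is already a lazy sequence object, so behavior here is unchanged:

    # Python 2: xrange(n) was lazy, range(n) built a list.
    # Python 3: range(n) is lazy; wrap in list(...) to materialize.
    for epoch in range(3):
        print("epoch", epoch)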
@@ -223,8 +223,8 @@
             if np.mod(global_step, len(dataset_list) * 100 / config.input_size) == 1:
                 test_loss, test_pred = self.sess.run([self.loss, self.pred], test_data_feed)
 
-                print "Step:%d [Epoch:%d] [Learning rate: %.6f] train_loss:%.6f test_loss:%.6f" % (
-                    global_step, epoch, learning_rate, train_loss, test_loss)
+                print("Step:%d [Epoch:%d] [Learning rate: %.6f] train_loss:%.6f test_loss:%.6f" % (
+                    global_step, epoch, learning_rate, train_loss, test_loss))
 
             # Plot samples
             for sample_sym, indices in sample_indices.iteritems():
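Two Python 2 idioms survive in the context lines of this hunk and would still break or misbehave under Python 3. dict.iteritems() was removed (items() returns a view in Python 3), and / on two ints became true division, so the np.mod(...) == 1 step check now compares against a float when config.input_size is an int. A hedged sketch of the fixes, reusing the names from the diff:

    # 1) dict.iteritems() does not exist in Python 3:
    for sample_sym, indices in sample_indices.items():
        ...  # plot samples

    # 2) int / int is true division in Python 3; use // to keep the old
    #    Python 2 floor-division semantics in the logging-step check:
    #    np.mod(global_step, len(dataset_list) * 100 // config.input_size) == 1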