added source

yuce · yuce · commit dea6cc7544e7 · 2017-03-29T10:52:26.000+03:00
diff --git a/README.md b/README.md
@@ -1,6 +1,19 @@
 # Getting Started
 
-## Schema
+## Usage
+
+```
+pip install -r requirements.txt
+python fetch.py
+```
+
+### Sample Project
+
+To get a glimpse of the capabilities of Pilosa, we will write a sample project called "Star Trace". The database for the project will contain information about the 1000 most recently updated GitHub projects which have "Austin" in their names, including stargazers, programming languages used and tags. People who have starred a project are called that project's stargazers.
+
+Although Pilosa doesn't keep the data in a tabular format, we will still use "columns" and "rows" when we talk about organizing our data. A common convention is putting the main focus (or subject) of a database in the columns, and properties of the subject in the rows. For instance, the columns of the "project" database would contain project IDs and the programming language(s) used for that project would be placed in the rows of the "language" *frame*.
+
+#### Schema
 
 ```
 project
@@ -13,12 +26,6 @@ project
 		row: language_id
 ```
 
-### Sample Project
-
-In order to get a glance of the capabilities of Pilosa, we will write a sample project called "Star Trace". The database for the project will contain information about most recently updated 1000 Github projects which have "Austin" in their names, including stargazers, programming languages used and tags. People who have starred a project are called that project's stargazers.
-
-Although Pilosa doesn't keep the data in a tabular format, we will still use "columns" and "rows" when we talk about organizing our data. A common convention is putting the main focus (or subject) of a database in the columns, and properties of the subject in the rows. For instance, the columns of the "project" database would contain project IDs and the programming language(s) used for that project would be placed in the rows of the "language" *frame*.
-
 #### Create the Schema
 
 Before we can import data or run queries, we need to create the schema for our databases. Let's create the project database first:
diff --git a/fetch.py b/fetch.py
@@ -0,0 +1,77 @@
+
+import os
+from github import Github
+
+# Layout of the ``starred_at`` column in project-stargazer.csv:
+# date plus hour:minute, e.g. 2017-03-29T10:52.
+TIME_FORMAT = "%Y-%m-%dT%H:%M"
+
+
+class StarTrace:
+    """Fetch GitHub repositories matching a query and dump them as CSV files.
+
+    GitHub's own identifiers (repository IDs, user IDs, language names) are
+    mapped to dense, zero-based internal IDs in order of first appearance.
+    The mappings live on the instance, so IDs stay consistent across
+    repeated ``search`` calls on the same object.
+    """
+
+    def __init__(self, path=None, token=None):
+        """Set up empty ID mappings.
+
+        :param path: directory the output files are written to; defaults to
+            the working directory at the time the instance is created.
+        :param token: value handed to ``Github(...)`` when searching
+            (``None`` is passed through unchanged).
+        """
+        # Resolve the default here rather than in the signature, where it
+        # would be frozen at class-definition time.
+        self.path = os.getcwd() if path is None else path
+        self.token = token
+        # external ID to project ID (internal)
+        self.e2p = {}
+        # external ID to stargazer ID (internal)
+        self.e2s = {}
+        # language to language ID (internal)
+        self.e2l = {}
+
+    def search(self, query):
+        """Search repositories (sorted by stars) and write the result files.
+
+        Files created in ``self.path``:
+
+        * ``project-language.csv``: ``language_id,project_id`` per line
+        * ``project-stargazer.csv``: ``stargazer_id,project_id,starred_at``
+          per line, with ``starred_at`` formatted using ``TIME_FORMAT``
+        * ``languages.txt``: language names, one per line, ordered by their
+          internal ID
+
+        ``languages.txt`` is written even when fetching fails part-way, so
+        the IDs in a partially written CSV can still be resolved.
+
+        :param query: search string passed to ``search_repositories``.
+        """
+        # Context managers guarantee both CSV handles are closed, including
+        # when opening the second file fails.
+        with open(self.get_path("project-stargazer.csv"), "w") as stargazer_frame, \
+                open(self.get_path("project-language.csv"), "w") as language_frame:
+            try:
+                gh = Github(self.token)
+                search = gh.search_repositories(query, sort='stars')
+                for i, repo in enumerate(search):
+                    # Progress output: running index and GitHub repo ID.
+                    print(i, repo.id)
+                    project_id = self.add_or_get_project(repo.id)
+                    for lang in repo.get_languages():
+                        language_frame.write("{lang_id},{project_id}\n".format(
+                            lang_id=self.add_or_get_language(lang),
+                            project_id=project_id
+                        ))
+                    for stargazer in repo.get_stargazers_with_dates():
+                        stargazer_frame.write("{stargazer_id},{project_id},{starred_at}\n".format(
+                            stargazer_id=self.add_or_get_stargazer(stargazer.user.id),
+                            project_id=project_id,
+                            starred_at=stargazer.starred_at.strftime(TIME_FORMAT)
+                        ))
+            finally:
+                self._write_languages()
+
+    def _write_languages(self):
+        """Write the known language names to languages.txt, ordered by ID."""
+        with open(self.get_path("languages.txt"), "w") as f:
+            f.write('\n'.join(sorted(self.e2l, key=self.e2l.get)))
+
+    def add_or_get_project(self, repo_id):
+        """Return the internal project ID for a GitHub repository ID."""
+        return self._add_or_get(repo_id, self.e2p)
+
+    def add_or_get_stargazer(self, user_id):
+        """Return the internal stargazer ID for a GitHub user ID."""
+        return self._add_or_get(user_id, self.e2s)
+
+    def add_or_get_language(self, language):
+        """Return the internal language ID for a language name."""
+        return self._add_or_get(language, self.e2l)
+
+    def get_path(self, filename):
+        """Return ``filename`` joined onto the output directory."""
+        return os.path.join(self.path, filename)
+
+    @classmethod
+    def _add_or_get(cls, external_id, store):
+        """Return the internal ID for ``external_id``, allocating it if new.
+
+        A new ID equals the number of entries already in ``store``, so IDs
+        are assigned 0, 1, 2, ... in order of first appearance.
+
+        :param external_id: hashable key to look up.
+        :param store: dict mapping external IDs to internal IDs; updated in
+            place when ``external_id`` is new.
+        """
+        # len(store) is evaluated before any insertion takes place.
+        return store.setdefault(external_id, len(store))
+
+def main():
+    """Run the "Austin" repository search from the command line.
+
+    If a file named ``token`` exists in the current working directory, its
+    stripped contents are passed to ``StarTrace`` as the token; otherwise
+    (or when the file is empty) ``None`` is passed.
+    """
+    token = None
+    if os.path.exists("token"):
+        # The context manager closes the handle instead of leaving it to
+        # the garbage collector.
+        with open("token") as f:
+            # An empty or whitespace-only file counts as "no token" rather
+            # than being sent on as an empty credential.
+            token = f.read().strip() or None
+    st = StarTrace(token=token)
+    st.search("Austin")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,8 @@
+appdirs==1.4.3
+packaging==16.8
+pkg-resources==0.0.0
+PyGithub==1.33
+PyJWT==1.4.2
+pyparsing==2.2.0
+requests==2.13.0
+six==1.10.0