RumbleDB · ghislainfourny · Feb 2, 2022 · Feb 2, 2022 · Feb 3, 2022 · Feb 3, 2022
@@ -210,31 +210,27 @@
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-core_2.12</artifactId>
-            <version>3.4.2</version>
-            <scope>provided</scope>
+            <version>3.4.3</version>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-sql_2.12</artifactId>
-            <version>3.4.2</version>
-            <scope>provided</scope>
+            <version>3.4.3</version>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-mllib_2.12</artifactId>
-            <version>3.4.2</version>
-            <scope>provided</scope>
+            <version>3.4.3</version>
         </dependency>
         <dependency>
             <groupId>org.apache.hadoop</groupId>
              <artifactId>hadoop-aws</artifactId>
             <version>3.3.2</version>
-            <scope>provided</scope>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-avro_2.12</artifactId>
-            <version>3.4.2</version>
+            <version>3.4.3</version>
         </dependency>
         <dependency>
             <groupId>org.antlr</groupId>

@@ -33,6 +33,7 @@
 import org.rumbledb.exceptions.ExceptionMetadata;
 import org.rumbledb.optimizations.Profiler;
 import org.rumbledb.runtime.functions.input.FileSystemUtil;
+import org.slf4j.Logger;
 
 import sparksoniq.spark.SparkSessionManager;
 import java.io.IOException;
@@ -43,11 +44,13 @@
 import java.util.Map;
 import java.util.stream.Collectors;
 
+import org.apache.spark.internal.Logging;
 
-public class JsoniqQueryExecutor {
+public class JsoniqQueryExecutor implements Logging {
     private RumbleRuntimeConfiguration configuration;
 
     public JsoniqQueryExecutor(RumbleRuntimeConfiguration configuration) {
+        initializeLogIfNecessary(true, true);
         this.configuration = configuration;
         SparkSessionManager.COLLECT_ITEM_LIMIT = configuration.getResultSizeCap();
     }
@@ -238,4 +241,16 @@ public long runInteractive(String query, List<Item> resultList) throws IOExcepti
         return SparkSessionManager.collectRDDwithLimitWarningOnly(rdd, resultList);
     }
 
+    @Override
+    public Logger org$apache$spark$internal$Logging$$log_() {
+        // TODO Auto-generated method stub
+        return null;
+    }
+
+    @Override
+    public void org$apache$spark$internal$Logging$$log__$eq(Logger x$1) {
+        // TODO Auto-generated method stub
+
+    }
+
 }
@@ -109,7 +109,13 @@ private static void handleException(Throwable ex, boolean showErrorInfo) {
                     "⚠️  Java went out of memory."
                 );
                 System.err.println(
-                    "If running locally, try adding --driver-memory 10G (or any quantity you need) between spark-submit and the RumbleDB jar in the command line to see if it fixes the problem. If running on a cluster, --executor-memory is the way to go."
+                    "If running locally with java -jar, try adding -Xmx10g (or any quantity you need) before the RumbleDB jar in the command line to see if it fixes the problem."
+                );
+                System.err.println(
+                    "If running locally with spark-submit, try adding --driver-memory 10G (or any quantity you need) between spark-submit and the RumbleDB jar in the command line to see if it fixes the problem."
+                );
+                System.err.println(
+                    "If running on a cluster, --executor-memory should be used instead."
                 );
                 if (showErrorInfo) {
                     ex.printStackTrace();

@@ -120,6 +120,26 @@ public SparkSession getOrCreateSession() {
 
     private void setDefaultConfiguration() {
         try {
+            if (System.getProperty("hadoop.home.dir") == null) {
+                System.err.println(
+                    "[WARNING] The hadoop home directory was not set. Setting to \"/\"."
+                );
+                System.setProperty("hadoop.home.dir", "/");
+            }
+            String javaVersion = System.getProperty("java.version");
+            if (!javaVersion.startsWith("1.8") && !javaVersion.startsWith("11.")) {
+                System.err.println("[Error] RumbleDB requires Java 8 or Java 11.");
+                System.err.println("Your Java version: " + System.getProperty("java.version"));
+            }
+
+            /*
+             * System.err.println(
+             * "[INFO] Total available memory: " + (Runtime.getRuntime().maxMemory() / 1000000000) + " GB"
+             * );
+             * System.err.println(
+             * "[INFO] Total available cores: " + Runtime.getRuntime().availableProcessors()
+             * );
+             */
             this.configuration = new SparkConf();
             if (this.configuration.get("spark.app.name", "<none>").equals("<none")) {
                 LogManager.getLogger("SparkSessionManager")

@@ -1,62 +1,48 @@
 RumbleDB is a JSONiq engine that can be used both on your laptop and on a
 cluster (e.g. with Amazon EMR or Azure HDInsight).
 
-It runs on top of Apache Spark and must be invoked with spark-submit, both for
-local use and for cluster use. Spark must be installed either on your laptop,
-or on the cluster.
+This is the standalone jar that does not require the installation of Spark.
+
+If you need more control over Spark or use it on a cluster, we recommend using
+the leaner jars instead, which you can download from www.rumbledb.org.
 
 If you do not want to install Spark, then you need to use the standalone jar
 instead from www.rumbledb.org.
 
 Usage:
-spark-submit <Spark arguments> <path to RumbleDB's jar> <mode> <parameters>
+java -jar <path to RumbleDB's jar> <mode> <parameters>
 
 The first optional argument specifies the mode:
 **** run ****
 for directly running a query from an input file or (with -q) provided directly on the command line.
 
 It is the default mode.
 
-spark-submit rumbledb-1.22.0.jar run my-query.jq
-spark-submit rumbledb-1.22.0.jar run -q '1+1'
+java -jar rumbledb-1.22.0.jar run my-query.jq
+java -jar rumbledb-1.22.0.jar run -q '1+1'
 
 You can specify an output path with -o like so:
-spark-submit rumbledb-1.22.0.jar run -q '1+1' -o my-output.txt
+java -jar rumbledb-1.22.0.jar run -q '1+1' -o my-output.txt
 
 **** serve ****
 for running as an HTTP server listening on the specified port (-p) and host (-h).
 
-spark-submit rumbledb-1.22.0.jar serve -p 9090
+java -jar rumbledb-1.22.0.jar serve -p 9090
 
 RumbleDB also supports Apache Livy for use in Jupyter notebooks, which may be
 even more convenient if you are using a cluster.
 
 **** repl ****
 for shell mode.
 
-spark-submit rumbledb-1.22.0.jar repl
+java -jar rumbledb-1.22.0.jar repl
 
 
 **** resource use configuration ****
 
 For a local use, you can control the number of cores, as well as allocated
 memory, with:
-spark-submit --master local[*] rumbledb-1.22.0.jar repl
-spark-submit --master local[*] rumbledb-1.22.0.jar repl
-spark-submit --master local[2] rumbledb-1.22.0.jar repl
-spark-submit --master local[*] --driver-memory 10G rumbledb-1.22.0.jar repl
-
-You can use RumbleDB remotely with:
-spark-submit --master yarn rumbledb-1.22.0.jar repl
-
-(Although for clusters provided as a service, --master yarn is often implicit
-and unnecessary).
-
-For remote use (e.g., logged in on the Spark cluster with ssh), you can set the
-number of executors, cores and memory, you can use:
-spark-submit --executor-cores 3 --executor-memory 5G rumbledb-1.22.0.jar repl
 
-For remote use, you can also use other file system paths such as S3, HDFS, etc:
-spark-submit rumbledb-1.22.0.jar run hdfs://server:port/my-query.jq -o hdfs://server:port/my-output.json
+java -Xmx10g -jar rumbledb-1.22.0.jar repl
 
 More documentation on available CLI parameters is available on https://www.rumbledb.org/